In [31]:
from google.colab import drive

In [32]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
!ls /content/drive/MyDrive/4sq/input/

lgb_012.lgb  pairs.csv		    train_data1.csv  train_data5.csv
lgb_01.lgb   sample_submission.csv  train_data2.csv  tv_ids_d.npy
lgb_02.lgb   test.csv		    train_data3.csv
lgb_12.lgb   train.csv		    train_data4.csv


In [34]:
! pip3 install Levenshtein
! pip3 install pykakasi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [35]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import time
import random
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
import numpy as np
import lightgbm as lgb
import pykakasi
from collections import Counter
from tqdm.auto import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [63]:
## Parameters
# Debug switch: when True the data-load cell subsamples the training table.
is_debug = False
#is_debug = True

SEED = 2022
# Number of nearest neighbours requested from recall_knn.
#num_neighbors = 20
num_neighbors = 30

# 0 or 1: two dataset variants are prepared; this selects which of the two
# GroupKFold halves is used as the training half.
#data_split = 0 
data_split = 1


# Number of id chunks the feature table is split into when it is written out.
num_split = 5
# Columns used for feature generation ('dist' stands for the coordinate-based features).
# Open question from the author: use city, zip and phone without vectorizing them?
feat_columns = ['dist', 'name', 'address', 'city', 'state', 'zip', 'url', 'phone', 'categories', 'country']
# Columns that are converted to tf-idf vectors.
vec_columns = ['name', 'categories', 'address', 'state', 'url', 'country']
# Columns used by the simple (exact-match) recall.
# NOTE(review): 'address' is listed twice, so an address match is counted twice
# by recall_simple — confirm whether this weighting is intended or a typo.
rec_columns = ['name', 'address', 'categories', 'address', 'phone']
# Columns used as categorical features (originally intended for CatBoost).
cat_features = ['country']

def seed_everything(seed):
    """Seed the global `random` and NumPy generators and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
seed_everything(SEED)

In [64]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [65]:
%%cython
# Longest common subsequence (LCS) problem: returns the length of the longest
# common subsequence of the two strings S and T.
def LCS(str S, str T):
    cdef int i, j
    # dp[i][j] holds the LCS length of the prefixes S[:i] and T[:j];
    # row 0 and column 0 stay 0 (empty prefix).
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            # Take the diagonal (+1 when the characters match) or carry over the
            # best neighbouring cell. dp[i + 1][j + 1] is still 0 when it is
            # read here, so including it in max() does not change the result.
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

In [66]:
def manhattan(lat1, long1, lat2, long2):
    """Return the Manhattan (L1) distance between two coordinate pairs.

    Works element-wise on scalars or NumPy arrays; the result is expressed in
    the same unit as the inputs.
    """
    lat_gap = np.abs(lat2 - lat1)
    lon_gap = np.abs(long2 - long1)
    return lat_gap + lon_gap

def vectorized_haversine(lats1, lats2, longs1, longs2):
    """Return the great-circle (haversine) distance in kilometres.

    All four arguments are interpreted as degrees (they are converted with
    np.radians) and may be scalars or NumPy arrays. Note the argument order:
    both latitudes first, then both longitudes.
    """
    earth_radius_km = 6371
    half_dlat_sin = np.sin(np.radians(lats2 - lats1) / 2)
    half_dlon_sin = np.sin(np.radians(longs2 - longs1) / 2)
    # Haversine term; the multiplication order is kept as in the classic formula.
    hav = half_dlat_sin * half_dlat_sin + np.cos(np.radians(lats1)) \
        * np.cos(np.radians(lats2)) * half_dlon_sin * half_dlon_sin
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return earth_radius_km * central_angle

In [67]:
# Candidate generation without kNN: pair up records that share identical
# attribute values.
def recall_simple(df, threshold = 2):
    """Generate candidate pairs from exact, case-insensitive attribute matches.

    For every column in the module-level `rec_columns`, records carrying the
    same lower-cased value are looked up. For each record, every id (including
    the record itself) is counted once per column it matches on; a column
    listed twice in `rec_columns` is counted twice. Ids whose count is strictly
    greater than `threshold` become candidates.

    Missing values are skipped through an explicit type check rather than a
    'null' sentinel, so a value that literally reads "null" is matched like
    any other text, and non-text cells no longer raise.

    Args:
        df: Frame with an 'id' column and every column of `rec_columns`.
        threshold: A pair is kept when its match count is > threshold.

    Returns:
        De-duplicated DataFrame with the columns 'id' and 'match_id'.
    """
    # Per column: lower-cased value -> set of ids carrying that value.
    # groupby drops missing values, so NaN never becomes a key.
    lowered_d = {}
    val2id_d = {}
    for col in rec_columns:
        if col in val2id_d:
            continue  # column listed more than once: reuse the lookup
        lowered = df[col].str.lower()
        lowered_d[col] = lowered.to_numpy()
        val2id_d[col] = df['id'].groupby(lowered).apply(set).to_dict()

    # One (lookup, values) entry per rec_columns item, duplicates included, so
    # that a repeated column keeps contributing twice to the match count.
    per_column = [(val2id_d[col], lowered_d[col]) for col in rec_columns]

    cus_ids = []
    match_ids = []
    for row, cus_id in enumerate(tqdm(df['id'].to_numpy())):
        counts = Counter()
        for val2id, values in per_column:
            value = values[row]
            # Missing cells are NaN (not str) after .str.lower(): skip them.
            if isinstance(value, str):
                counts.update(val2id[value])
        match_id = [k for k, v in counts.items() if v > threshold]
        # Repeat the seed id once per matched id so both lists stay aligned.
        cus_ids += [cus_id] * len(match_id)
        match_ids += match_id

    train_df = pd.DataFrame({'id': cus_ids, 'match_id': match_ids}).drop_duplicates()
    del cus_ids, match_ids

    num_data = len(train_df)
    num_ids = train_df['id'].nunique()
    # Guard against a frame without any candidate pair.
    num_data_per_id = num_data / num_ids if num_ids else 0.0
    print('Num of data: %s' % num_data)
    print('Num of data per id: %s' % num_data_per_id)

    return train_df

In [68]:
def _knn_pairs(ids, coords_rad, n_neighbors, dist_col, rank_col):
    """Return one (id, match_id, distance, rank) row per point and neighbour.

    `coords_rad` is an (n, 2) array of [latitude, longitude] in radians, as the
    haversine metric requires. Rows are ordered rank-major (all rank-0 rows
    first, then rank 1, ...). The neighbour count is capped at the number of
    points so tiny groups do not make the search fail.
    """
    n_points = len(ids)
    if n_points == 0:
        return pd.DataFrame(columns = ['id', 'match_id', dist_col, rank_col])
    n_neighbors = min(n_points, n_neighbors)
    knn = NearestNeighbors(n_neighbors = n_neighbors,
                           metric = 'haversine',
                           n_jobs = -1)
    knn.fit(coords_rad)
    # Querying the fitted points themselves: each point is its own rank-0 hit.
    dists, nears = knn.kneighbors(coords_rad, return_distance = True)
    return pd.DataFrame({
        'id': np.tile(ids, n_neighbors),
        'match_id': ids[nears.T.ravel()],
        dist_col: dists.T.ravel(),
        rank_col: np.repeat(np.arange(n_neighbors), n_points),
    })


def recall_knn(df, Neighbors = 10):
    """Generate candidate pairs from geographic nearest neighbours.

    Two searches are run with the haversine metric: one inside each country
    (columns 'kdist_country' / 'kneighbors_country') and one over all rows
    (columns 'kdist' / 'kneighbors'). Both results are outer-joined on
    ('id', 'match_id'), so a pair found by only one search has NaN in the
    other search's columns. Rows with a missing country only take part in the
    global search because groupby drops them.

    NOTE(review): as before, `df`'s 'latitude' and 'longitude' columns are
    converted from degrees to radians IN PLACE by this call. Code that runs
    afterwards on the same frame sees radians, and calling this function twice
    on the same frame converts twice. This side effect is kept because later
    cells read the converted values; confirm before changing it.

    Args:
        df: Frame with 'id', 'country', 'latitude', 'longitude' (degrees).
        Neighbors: Neighbours per point; capped at the number of rows available.

    Returns:
        DataFrame with id, match_id, kdist, kneighbors, kdist_country and
        kneighbors_country.
    """
    print('Start knn grouped by country')
    # Per country: groups smaller than `Neighbors` use every row of the group.
    country_parts = []
    for country, country_df in tqdm(df.groupby('country')):
        country_parts.append(_knn_pairs(
            country_df['id'].to_numpy(),
            np.deg2rad(country_df[['latitude', 'longitude']].to_numpy()),
            Neighbors, 'kdist_country', 'kneighbors_country'))
    if country_parts:
        train_df_country = pd.concat(country_parts, ignore_index = True)
    else:
        train_df_country = pd.DataFrame(
            columns = ['id', 'match_id', 'kdist_country', 'kneighbors_country'])
    del country_parts

    print('Start knn')
    # Global search, ignoring the country. See the NOTE in the docstring: this
    # assignment deliberately keeps the original in-place radian conversion.
    df[['latitude', 'longitude']] = np.deg2rad(df[['latitude', 'longitude']])
    train_df = _knn_pairs(df['id'].to_numpy(),
                          df[['latitude', 'longitude']].to_numpy(),
                          Neighbors, 'kdist', 'kneighbors')

    # Outer join so that information from both searches is kept.
    train_df = train_df.merge(train_df_country,
                              on = ['id', 'match_id'],
                              how = 'outer')
    del train_df_country

    return train_df

In [69]:
def add_features(df):
    """Add pairwise features for every (id, match_id) row of `df`.

    Reads the notebook globals `data` (indexed by id), `feat_columns`,
    `vec_columns`, `cat_features`, `tfidf_d` and `id2index_d`.

    For the pseudo column 'dist' coordinate features are added (latdiff,
    londiff, manhattan, euclidean, haversine and the raw coordinates). For
    every other column the string similarities gesh (difflib ratio), leven,
    jaro and lcs are added, plus a tf-idf dot product `{col}_sim` for columns
    in `vec_columns` and, except for 'phone' and 'zip', length-normalised
    variants. The raw values of `cat_features` are added as `{cat}_x`/`{cat}_y`.

    The input frame is not modified; a new frame is returned.

    NOTE(review): vectorized_haversine converts its inputs from degrees, but
    recall_knn converts the frame it is given to radians in place. If `data`
    is that frame, 'haversine' is not a true distance in km and the other
    coordinate features are in radians — confirm before relying on the units.
    NOTE(review): missing values become the string 'nan' (length 3), so
    `{col}_len_diff` is computed against 3 for missing cells.
    """
    df = df.copy()  # never write into the caller's (possibly sliced) frame
    ids = df['id'].to_numpy()
    match_ids = df['match_id'].to_numpy()

    # Row positions inside the tf-idf matrices; built once on first use.
    indexs = None
    match_indexs = None
    cat_added = False

    for col in tqdm(feat_columns):
        if col == 'dist':
            lat = data.loc[ids, 'latitude'].values
            match_lat = data.loc[match_ids, 'latitude'].values
            lon = data.loc[ids, 'longitude'].values
            match_lon = data.loc[match_ids, 'longitude'].values
            # Several distance definitions derived from latitude/longitude.
            df['latdiff'] = np.abs(lat - match_lat)
            df['londiff'] = np.abs(lon - match_lon)
            # Manhattan distance (axis-aligned)
            df['manhattan'] = manhattan(lat, lon, match_lat, match_lon)
            # Euclidean distance (straight line)
            df['euclidean'] = (df['latdiff'] ** 2 + df['londiff'] ** 2) ** 0.5
            # Distance on the sphere
            df['haversine'] = vectorized_haversine(lat, match_lat, lon, match_lon)
            # The coordinates themselves are features as well.
            df['latitude_x'] = lat
            df['latitude_y'] = match_lat
            df['longitude_x'] = lon
            df['longitude_y'] = match_lon
            continue

        # Everything is compared as text; missing values become 'nan'.
        col_values = data.loc[ids, col].values.astype(str)
        matcol_values = data.loc[match_ids, col].values.astype(str)

        # Vector feature: dot product of the two tf-idf rows as similarity.
        if col in vec_columns:
            if indexs is None:
                indexs = [id2index_d[i] for i in ids]
                match_indexs = [id2index_d[i] for i in match_ids]
            tv_fit = tfidf_d[col]
            sims = tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1)
            df[f'{col}_sim'] = np.asarray(sims).ravel()

        geshs = []
        levens = []
        jaros = []
        lcss = []
        for s, match_s in zip(col_values, matcol_values):
            if s != 'nan' and match_s != 'nan':
                # Ratio of matching subsequences
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                # Levenshtein distance
                levens.append(Levenshtein.distance(s, match_s))
                # Jaro-Winkler similarity
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                # Longest common subsequence; plain str objects are passed to
                # the Cython function.
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)
        # Four similarity columns per source column.
        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss

        # phone and zip have a fixed length, so length features are skipped.
        if col not in ['phone', 'zip']:
            # The lengths are only intermediates, so they are kept as arrays
            # instead of temporary columns that have to be dropped again.
            col_len = np.char.str_len(col_values)
            match_col_len = np.char.str_len(matcol_values)
            df[f'{col}_len_diff'] = np.abs(col_len - match_col_len)
            # Levenshtein distance normalised by the longer string
            df[f'{col}_nleven'] = df[f'{col}_leven'] / np.maximum(col_len, match_col_len)
            # LCS length relative to each of the two strings
            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / match_col_len
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / col_len
            gc.collect()

        # Categorical features: written once, at the position where the first
        # text column finishes, so the output column order stays as before.
        if not cat_added:
            for cat in cat_features:
                df[f'{cat}_x'] = data.loc[ids, cat].values
                df[f'{cat}_y'] = data.loc[match_ids, cat].values
            cat_added = True
    return df

In [70]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every value of the 'id' column to its 'point_of_interest' value."""
    id_column = input_df['id']
    poi_column = input_df['point_of_interest']
    return {record_id: poi for record_id, poi in zip(id_column, poi_column)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map every 'point_of_interest' value to the set of ids that carry it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    return {poi: set(group_ids) for poi, group_ids in ids_by_poi}

In [71]:
def get_score(input_df: pd.DataFrame):
    """Return the mean Jaccard (IoU) score of the predicted matches.

    For each row, the space-separated ids in 'matches' are compared with the
    ids sharing the row's point of interest. The ground truth is looked up in
    the notebook globals `id2poi` and `poi2ids`.
    """
    def jaccard(id_str, matches):
        truth = poi2ids[id2poi[id_str]]
        predicted = set(matches.split())
        return len(truth & predicted) / len(truth | predicted)

    row_ids = input_df['id'].to_numpy()
    row_matches = input_df['matches'].to_numpy()
    per_row = [jaccard(id_str, matches) for id_str, matches in zip(row_ids, row_matches)]
    return np.array(per_row).mean()

def analysis(df):
    """Print row, unique-id and unique-POI counts and the mean rows per POI."""
    ids_per_poi = df.groupby('point_of_interest')['id'].count()
    report = (
        ('Num of data: %s', len(df)),
        ('Num of unique id: %s', df['id'].nunique()),
        ('Num of unique poi: %s', df['point_of_interest'].nunique()),
        ('Mean num of unique poi: %s', ids_per_poi.mean()),
    )
    for template, value in report:
        print(template % value)

In [72]:
# https://www.kaggle.com/code/nlztrk/public-0-861-pykakasi-radian-coordinates
# Japanese text is converted to the Latin alphabet.
def convert_japanese_alphabet(df: pd.DataFrame):
    """Romanize the text columns of all rows whose country is "JP".

    The columns name, address, city and state are converted with pykakasi
    (hiragana, katakana and kanji to alphabet). Missing or non-text cells are
    left untouched, and a cell whose conversion fails keeps its original
    value. `df` is modified in place and also returned.

    A progress bar is shown when `tqdm.pandas()` has been called beforehand;
    otherwise the conversion runs silently.
    """
    kakasi = pykakasi.kakasi()
    kakasi.setMode('H', 'a')  # Convert Hiragana into alphabet
    kakasi.setMode('K', 'a')  # Convert Katakana into alphabet
    kakasi.setMode('J', 'a')  # Convert Kanji into alphabet
    conversion = kakasi.getConverter()

    def romanize(value):
        # Missing cells are not str: pass them through untouched.
        if not isinstance(value, str):
            return value
        try:
            return conversion.do(value)
        except Exception:
            # Best effort: keep the original text if the converter fails, but
            # let KeyboardInterrupt / SystemExit propagate.
            return value

    jp_rows = df["country"] == "JP"
    for column in ["name", "address", "city", "state"]:
        if column not in df.columns:
            continue
        jp_values = df.loc[jp_rows, column]
        # progress_map only exists after tqdm.pandas(); fall back to map.
        mapper = getattr(jp_values, "progress_map", jp_values.map)
        # Column-wise assignment leaves every other column (and dtype) alone.
        df.loc[jp_rows, column] = mapper(romanize)
    return df

In [73]:
## Data load
# NOTE(review): absolute Google Drive path; the notebook only runs where the
# Drive is mounted at /content/drive with this folder layout.
data_root = '/content/drive/MyDrive/4sq/input/'
data = pd.read_csv(os.path.join(data_root, 'train.csv'))

# In debug mode work on a seeded random subsample and renumber the rows.
if is_debug:
    #data = data.sample(n = 10000, random_state = SEED)
    data = data.sample(n = 100000, random_state = SEED)
    data = data.reset_index(drop = True)

In [74]:
tqdm.pandas()

In [75]:
data = convert_japanese_alphabet(data)

  0%|          | 0/70032 [00:00<?, ?it/s]

In [76]:
data.head()

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest
0,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,P_677e840bb6fc7e
1,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,,BR,,,Brazilian Restaurants,P_d82910d8382a83
2,E_000007f24ebc95,ร้านตัดผมการาเกด,13.780813,100.4849,,,,,TH,,,Salons / Barbershops,P_b1066599e78477
3,E_000008a8ba4f48,Turkcell,37.84451,27.844202,Adnan Menderes Bulvarı,,,,TR,,,Mobile Phone Shops,P_b2ed86905a4cd3
4,E_00001d92066153,Restaurante Casa Cofiño,43.338196,-4.326821,,Caviedes,Cantabria,,ES,,,Spanish Restaurants,P_809a884d4407fb


In [77]:
data.count()

id                   1138812
name                 1138811
latitude             1138812
longitude            1138812
address               742191
city                  839623
state                 718226
zip                   543386
country              1138801
url                   267724
phone                 342855
categories           1040505
point_of_interest    1138812
dtype: int64

### Data Split

In [78]:
# GroupKFold so that all rows of the same point_of_interest end up in the same fold.
kf = GroupKFold(n_splits=2)

# The arguments are X, y and groups. split() returns row positions, which are
# used as labels with data.loc below.
# NOTE(review): this assumes data still has its default 0..n-1 index — confirm.
for i, (trn_idx, val_idx) in enumerate(kf.split(data, data['point_of_interest'], data['point_of_interest'])):
    data.loc[val_idx, 'set'] = i

print('Num of train data: %s' % len(data))
print(data['set'].value_counts())

# data_split decides which of the two folds is used as the training half.
train_number = 1
valid_number = 0
if data_split == 1:
  train_number = 0
  valid_number = 1

valid_data = data[data['set'] == valid_number]
train_data = data[data['set'] == train_number]

print('Train data: ')
analysis(train_data)
print('Valid data: ')
analysis(valid_data)

train_poi = train_data['point_of_interest'].unique().tolist()
valid_poi = valid_data['point_of_interest'].unique().tolist()

# Sanity check: no point of interest may appear in both halves.
print(set(train_poi) & set(valid_poi))

train_ids = train_data['id'].unique().tolist()
valid_ids = valid_data['id'].unique().tolist()

print(set(train_ids) & set(valid_ids)) # empty when the split worked

# Keep the id split in a dict
tv_ids_d = {}
tv_ids_d['train_ids'] = train_ids
tv_ids_d['valid_ids'] = valid_ids

# Persist the split to disk.
# NOTE(review): tv_ids_d is a dict, so np.save has to pickle it; loading it
# back presumably needs allow_pickle=True — confirm.
np.save('tv_ids_d.npy', tv_ids_d)

del train_data, valid_data
gc.collect()

# From here on `data` only holds the training half, with a fresh 0..n-1 index.
data = data.set_index('id')
data = data.loc[tv_ids_d['train_ids']] # keep only the training part
data = data.reset_index() # renumber the index

Num of train data: 1138812
1.0    569406
0.0    569406
Name: set, dtype: int64
Train data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369985
Mean num of unique poi: 1.5389975269267673
Valid data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369987
Mean num of unique poi: 1.5389892077289202
set()
set()


## Train data generated by knn

In [79]:
# special process
# Normalise text to lower case.
def get_lower(x):
    """Return `x` lower-cased when it is a string, otherwise `x` unchanged.

    Missing cells (NaN floats, None) and other non-text values pass through
    untouched. An explicit type check replaces the former bare `except`, which
    also swallowed unrelated errors such as KeyboardInterrupt.
    """
    if isinstance(x, str):
        return x.lower()
    return x
# Lower-case every object-dtype column of `data` except the 'id' key, so later
# string comparisons are case-insensitive.
for col in data.columns:
    if data[col].dtype == object and col != 'id':
        data[col] = data[col].apply(get_lower)

In [80]:
id2index_d = dict(zip(data['id'].values, data.index)) # dict mapping each id to its row index

# Turn the text columns into tf-idf vectors; they are used later to compute similarities.
# Missing values are replaced by the token 'nan' before vectorizing.
tfidf_d = {}
for col in vec_columns:
    tfidf = TfidfVectorizer()
    tv_fit = tfidf.fit_transform(data[col].fillna('nan'))
    tfidf_d[col] = tv_fit

# Candidate pairs from exact attribute matches and from geographic neighbours.
train_data_simple = recall_simple(data)
train_data = recall_knn(data, num_neighbors)

print('train data by knn: %s' % len(train_data))
# Outer join: pairs found only by the simple recall have NaN in the knn columns.
train_data = train_data.merge(train_data_simple,
                             on = ['id', 'match_id'],
                             how = 'outer')
del train_data_simple
gc.collect()

data = data.set_index('id')
ids = train_data['id'].tolist()
match_ids = train_data['match_id'].tolist()

poi = data.loc[ids]['point_of_interest'].values # POI of the seed id
match_poi = data.loc[match_ids]['point_of_interest'].values # POI of the matched id

train_data['label'] = np.array(poi == match_poi, dtype = np.int8) # 1 when both ids share the same POI
del poi, match_poi, ids, match_ids
gc.collect()

print('Num of unique id: %s' % train_data['id'].nunique())
print('Num of train data: %s' % len(train_data))
print('Pos rate: %s' % train_data['label'].mean())
print(train_data.sample(5))

  0%|          | 0/569406 [00:00<?, ?it/s]

Num of data: 1629688
Num of data per id: 4.371246177780162
Start knn grouped by country


  0%|          | 0/209 [00:00<?, ?it/s]

Start knn
train data by knn: 17166496
Num of unique id: 569406
Num of train data: 18352838
Pos rate: 0.05662170613613001
                        id          match_id     kdist  kneighbors  \
16343448  E_b3ce594283aaab  E_fbb72bc3f99131  0.002281        28.0   
8819167   E_7d24d0a16a92a0  E_f586fa549d7a1e  0.000025        15.0   
14894839  E_2889bc73adeab0  E_131ff81984887d  0.000110        26.0   
7432441   E_0d992c4213dcba  E_415a6816e1d753  0.000074        13.0   
100695    E_2d3b561f06aac1  E_2d3b561f06aac1  0.000000         0.0   

          kdist_country  kneighbors_country  label  
16343448       0.002281                28.0      0  
8819167        0.000025                15.0      0  
14894839       0.000110                26.0      0  
7432441        0.000074                13.0      0  
100695         0.000000                 0.0      1  


## Eval


In [81]:
data = data.reset_index() # the index is currently 'id'; move it back to a column and renumber the rows

In [82]:
id2poi = get_id2poi(data) # id -> point of interest
poi2ids = get_poi2ids(data) # point of interest -> set of ids

In [83]:
# Start the evaluation frame with every id matched to itself.
eval_df = pd.DataFrame()
eval_df['id'] = data['id'].unique().tolist()
eval_df['match_id'] = eval_df['id'] # id == match_id, so the frame has the same shape as the candidate pairs
print('Unique id: %s' % len(eval_df))

Unique id: 569406


In [84]:
eval_df_ = train_data[train_data['label'] == 1][['id', 'match_id']] # true pairs among the generated candidates
eval_df = pd.concat([eval_df, eval_df_]) # append them to the self-matches
# Collect all matched ids per id and join the distinct ones into one space-separated string.
eval_df = eval_df.groupby('id')['match_id'].apply(list).reset_index()
eval_df['matches'] = eval_df['match_id'].apply(lambda x: ' '.join(set(x)))
print('Unique id: %s' % len(eval_df))

Unique id: 569406


In [85]:
eval_df

Unnamed: 0,id,match_id,matches
0,E_000002eae2a589,"[E_000002eae2a589, E_000002eae2a589, E_e80db43...",E_e80db432029aea E_000002eae2a589
1,E_000007f24ebc95,"[E_000007f24ebc95, E_000007f24ebc95]",E_000007f24ebc95
2,E_000008a8ba4f48,"[E_000008a8ba4f48, E_000008a8ba4f48]",E_000008a8ba4f48
3,E_00001d92066153,"[E_00001d92066153, E_00001d92066153, E_7e0d8e9...",E_7e0d8e9138dd56 E_00001d92066153
4,E_000023d8f4be44,"[E_000023d8f4be44, E_000023d8f4be44, E_12453ef...",E_000023d8f4be44 E_12453effe251db
...,...,...,...
569401,E_ffff83a9496324,"[E_ffff83a9496324, E_ffff83a9496324]",E_ffff83a9496324
569402,E_ffff8cc1b92ab3,"[E_ffff8cc1b92ab3, E_ffff8cc1b92ab3]",E_ffff8cc1b92ab3
569403,E_ffff9509490675,"[E_ffff9509490675, E_ffff9509490675, E_8d4924d...",E_bc7f07ef18838d E_dfee4e9e388cad E_8d4924d430...
569404,E_ffffc572b4d35b,"[E_ffffc572b4d35b, E_ffffc572b4d35b, E_27bcc6f...",E_8081185a219dc3 E_ffffc572b4d35b E_27bcc6f6dd...


In [86]:
# Experiment log. Early runs: knn used Euclidean distance (the default) and the
# per-country knn used haversine without converting to radians — an odd setup.
# The score is a measure of recall: it would be 1.0 if every true match were
# covered by the candidates. It is also the upper bound for the ranking stage.
# num_neighbors = 10, score = 0.90793, Pos rate: 0.1174
# num_neighbors = 20, score = 0.92803, Pos rate: 0.0667
# num_neighbors = 30, score = 0.93773, Pos rate: 0.04687
# num_neighbors = 20, 100 for the id only: 0.06670
# simple recall, threshold = 1: 0.93520 0.03394

# per-country knn with radian conversion: 0.92719, Pos rate: 0.07579 (slightly lower, reason unknown)
# knn with haversine + radian conversion: 0.92618 0.079255
# after adding the extension ("stretch") step: 0.9276

iou_score = get_score(eval_df)
print('IoU score: %s' % iou_score)

IoU score: 0.9353242283413878


In [87]:
train_data

Unnamed: 0,id,match_id,kdist,kneighbors,kdist_country,kneighbors_country,label
0,E_000002eae2a589,E_000002eae2a589,0.0,0.0,0.0,0.0,1
1,E_000007f24ebc95,E_000007f24ebc95,0.0,0.0,0.0,0.0,1
2,E_000008a8ba4f48,E_000008a8ba4f48,0.0,0.0,0.0,0.0,1
3,E_00001d92066153,E_00001d92066153,0.0,0.0,0.0,0.0,1
4,E_000023d8f4be44,E_000023d8f4be44,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...
18352833,E_ffebf327be8677,E_fe3b25a6be0358,,,,,0
18352834,E_ffebf327be8677,E_f850acfc89e173,,,,,0
18352835,E_fff368a2c59546,E_de2bdda6d4977c,,,,,0
18352836,E_fff9ecf69690da,E_393d42bc0cdbce,,,,,1


In [222]:
# Qualitative check: what do the true pairs that the recall stage missed look like?
# Recalled true pairs (excluding self-matches), renamed to line up with the self-join below.
tmp_train = train_data.query('label==1 and id != match_id')[['id','match_id','label']].drop_duplicates()
tmp_train = tmp_train.rename(columns={'id': 'id_x'})
tmp_train = tmp_train.rename(columns={'match_id': 'id_y'})
#tmp_train.head()
# Self-join on the POI: every ordered pair of distinct ids that share a POI.
all_join = pd.merge(data, data, on='point_of_interest', how='outer').query('id_x != id_y')
# Left join: 'label' is 1 for recalled pairs and NaN for missed ones.
all_join = pd.merge(all_join, tmp_train, on=['id_x', 'id_y'], how='left')
all_join['diff_lat'] = abs(all_join['latitude_x'] - all_join['latitude_y'])
all_join['diff_lon'] = abs(all_join['longitude_x'] - all_join['longitude_y'])
neg = all_join.query('label != 1') # true pairs the recall stage missed
pos = all_join.query('label == 1') # true pairs the recall stage found

In [183]:
# Per country: number of rows, number of distinct POIs and rows per POI.
for country, country_df in tqdm(data.groupby('country')):
  num1 = country_df['country'].count()
  num2 = country_df['point_of_interest'].nunique()
  print(country, num1, num2, num1 / num2)


  0%|          | 0/210 [00:00<?, ?it/s]

ad 8 7 1.1428571428571428
ae 1994 1358 1.4683357879234167
af 10 7 1.4285714285714286
ag 5 3 1.6666666666666667
ai 5 3 1.6666666666666667
al 29 20 1.45
am 74 51 1.4509803921568627
an 1 1 1.0
ao 7 7 1.0
aq 13 9 1.4444444444444444
ar 2765 1741 1.588167719701321
at 1309 949 1.3793466807165438
au 5264 3849 1.3676279553130684
aw 35 23 1.5217391304347827
ax 14 12 1.1666666666666667
az 207 116 1.7844827586206897
ba 56 43 1.302325581395349
bb 46 36 1.2777777777777777
bd 73 52 1.4038461538461537
be 12877 8277 1.5557569167572791
bf 6 3 2.0
bg 831 536 1.5503731343283582
bh 223 152 1.4671052631578947
bl 4 2 2.0
bm 12 7 1.7142857142857142
bn 369 213 1.732394366197183
bo 97 61 1.5901639344262295
bq 2 2 1.0
br 25454 19541 1.3025945448032343
bs 40 27 1.4814814814814814
bt 5 3 1.6666666666666667
bv 1 1 1.0
bw 11 8 1.375
by 922 614 1.501628664495114
bz 16 10 1.6
ca 5973 4301 1.3887468030690537
cd 6 4 1.5
ch 1091 769 1.4187256176853056
ci 6 5 1.2
cl 3952 3275 1.2067175572519084
cm 7 4 1.75
cn 3643 2312 1.

## Add features


In [88]:
# Running totals for the chunked feature generation: rows written so far and
# the offset of the next id chunk.
count = 0
start_row = 0

# add_features looks rows up by id, so 'id' becomes the index of `data`.
data = data.set_index('id')
unique_id = train_data['id'].unique().tolist()

In [89]:
data

Unnamed: 0_level_0,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest,set
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
E_000002eae2a589,carioca manero,-0.399807,-0.753603,,,,,br,,,brazilian restaurants,p_d82910d8382a83,0.0
E_000007f24ebc95,ร้านตัดผมการาเกด,0.240521,1.753792,,,,,th,,,salons / barbershops,p_b1066599e78477,0.0
E_000008a8ba4f48,turkcell,0.660511,0.485973,adnan menderes bulvarı,,,,tr,,,mobile phone shops,p_b2ed86905a4cd3,0.0
E_00001d92066153,restaurante casa cofiño,0.756394,-0.075517,,caviedes,cantabria,,es,,,spanish restaurants,p_809a884d4407fb,0.0
E_000023d8f4be44,island spa,0.253404,2.112175,"5th flr, newport mall, resorts world manila",pasay city,metro manila,,ph,,,spas,p_020de174484ec6,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
E_ffff83a9496324,restoran nasi kandar arraaziq,0.054871,1.775268,jalan barat,kuala lumpur,kl,53000,my,,,indian restaurants,p_a630bf50670209,0.0
E_ffff8cc1b92ab3,fawsley hall,0.911174,-0.020571,fawsley nr daventry,daventry,northants,nn11 3ba,gb,,,hotels,p_4c0ab0c9bb6a79,0.0
E_ffff9509490675,kelab rahman putra hills course,0.056166,1.772531,,,,,my,,,golf courses,p_1a01d5511f3f36,0.0
E_ffffc572b4d35b,i̇zmir adnan menderes havaalanı,0.670621,0.473732,,i̇zmir,,,tr,,,airport services,p_ae96252a6a9380,0.0


In [90]:
# Generate the features in `num_split` chunks of ids and write one CSV per chunk.
# The counters are reset here so that re-running this cell always starts from
# the first chunk instead of continuing from a previous run's offsets.
count = 0
start_row = 0

# Number of ids per chunk; the last chunk also takes the remainder.
num_split_id = len(unique_id) // num_split
# k runs from 1 to num_split
for k in range(1, num_split + 1):
    print('Current split: %s' % k)
    end_row = start_row + num_split_id if k < num_split else len(unique_id)
    cur_id = unique_id[start_row : end_row]
    # All candidate pairs whose seed id belongs to this chunk
    cur_data = train_data[train_data['id'].isin(cur_id)]
    cur_data = add_features(cur_data)
    # Two extra features combining the global and the per-country neighbour search.
    # NOTE(review): kdist_country is 0 for self-matches and coincident points,
    # so kdist_diff is NaN or inf on those rows.
    cur_data['kdist_diff'] = (cur_data['kdist'] - cur_data['kdist_country']) / cur_data['kdist_country']
    cur_data['kneighbors_mean'] = cur_data[['kneighbors', 'kneighbors_country']].mean(axis = 1)
    print(cur_data.shape)
    print(cur_data.sample(1))

    # Write the finished chunk to CSV
    cur_data.to_csv(f'train_data_{data_split}_{k}.csv', index = False)
    start_row = end_row
    count += len(cur_data)

    del cur_data
    gc.collect()

print(count)

Current split: 1


  0%|          | 0/10 [00:00<?, ?it/s]

(3656449, 90)
                       id          match_id     kdist  kneighbors  \
8046481  E_2190d43ddb38c2  E_a2715857841761  0.000167        14.0   

         kdist_country  kneighbors_country  label   latdiff   londiff  \
8046481       0.000167                14.0      0  0.000106  0.000133   

         manhattan  ...  country_gesh  country_leven  country_jaro  \
8046481   0.000239  ...           1.0            0.0           1.0   

         country_lcs  country_len_diff  country_nleven  country_nlcsk  \
8046481          2.0                 0             0.0            1.0   

         country_nlcs  kdist_diff  kneighbors_mean  
8046481           1.0         0.0             14.0  

[1 rows x 90 columns]
Current split: 2


  0%|          | 0/10 [00:00<?, ?it/s]

(3682688, 90)
                        id          match_id     kdist  kneighbors  \
16087774  E_40e24f9c77c166  E_6cce2e378c8077  0.001128        28.0   

          kdist_country  kneighbors_country  label   latdiff   londiff  \
16087774       0.001128                28.0      0  0.001112  0.000189   

          manhattan  ...  country_gesh  country_leven  country_jaro  \
16087774   0.001301  ...           1.0            0.0           1.0   

          country_lcs  country_len_diff  country_nleven  country_nlcsk  \
16087774          2.0                 0             0.0            1.0   

          country_nlcs  kdist_diff  kneighbors_mean  
16087774           1.0         0.0             28.0  

[1 rows x 90 columns]
Current split: 3


  0%|          | 0/10 [00:00<?, ?it/s]

(3678951, 90)
                        id          match_id     kdist  kneighbors  \
15107268  E_883537015cabc3  E_4317eab04e47e5  0.000025        26.0   

          kdist_country  kneighbors_country  label   latdiff   londiff  \
15107268       0.000025                26.0      0  0.000025  0.000007   

          manhattan  ...  country_gesh  country_leven  country_jaro  \
15107268   0.000032  ...           1.0            0.0           1.0   

          country_lcs  country_len_diff  country_nleven  country_nlcsk  \
15107268          2.0                 0             0.0            1.0   

          country_nlcs  kdist_diff  kneighbors_mean  
15107268           1.0         0.0             26.0  

[1 rows x 90 columns]
Current split: 4


  0%|          | 0/10 [00:00<?, ?it/s]

(3661690, 90)
                        id          match_id     kdist  kneighbors  \
11765647  E_a9be7fb3238202  E_2eeec2fb2b15d7  0.000024        20.0   

          kdist_country  kneighbors_country  label   latdiff   londiff  \
11765647       0.000024                20.0      0  0.000024  0.000002   

          manhattan  ...  country_gesh  country_leven  country_jaro  \
11765647   0.000025  ...           1.0            0.0           1.0   

          country_lcs  country_len_diff  country_nleven  country_nlcsk  \
11765647          2.0                 0             0.0            1.0   

          country_nlcs  kdist_diff  kneighbors_mean  
11765647           1.0         0.0             20.0  

[1 rows x 90 columns]
Current split: 5


  0%|          | 0/10 [00:00<?, ?it/s]

(3673060, 90)
                       id          match_id     kdist  kneighbors  \
8503610  E_ef08893ea7559f  E_d24584b788b471  0.000035        14.0   

         kdist_country  kneighbors_country  label   latdiff   londiff  \
8503610       0.000035                14.0      0  0.000024  0.000026   

         manhattan  ...  country_gesh  country_leven  country_jaro  \
8503610    0.00005  ...           1.0            0.0           1.0   

         country_lcs  country_len_diff  country_nleven  country_nlcsk  \
8503610          2.0                 0             0.0            1.0   

         country_nlcs  kdist_diff  kneighbors_mean  
8503610           1.0         0.0             14.0  

[1 rows x 90 columns]
18352838
