In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import time
import random
import difflib
import multiprocessing
import pandas as pd
import numpy as np
import lightgbm as lgb
from collections import Counter
from tqdm.auto import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [3]:
## Data load
data_root = '/content/drive/MyDrive/4sq/input/'
data = pd.read_csv(os.path.join(data_root, 'train.csv'))

#data = data.sample(n = 500000)
data = data.reset_index(drop = True)

In [4]:
all_join = pd.merge(data, data, on='point_of_interest', how='outer').query('id_x != id_y')

In [5]:
all_join['diff_lat'] = abs(all_join['latitude_x'] - all_join['latitude_y'])
all_join['diff_lon'] = abs(all_join['longitude_x'] - all_join['longitude_y'])

In [6]:
# get haversine distance
def vectorized_haversine(lats1, lats2, longs1, longs2):
    radius = 6371
    dlat=np.radians(lats2 - lats1)
    dlon=np.radians(longs2 - longs1)
    a = np.sin(dlat/2) * np.sin(dlat/2) + np.cos(np.radians(lats1)) \
        * np.cos(np.radians(lats2)) * np.sin(dlon/2) * np.sin(dlon/2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = radius * c
    return d

In [7]:
def vectorized_lat(lats1, lats2, longs1, longs2):
    radius = 6371
    dlat=np.radians(lats2 - lats1)
    a = np.sin(dlat/2) * np.sin(dlat/2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = radius * c
    return d

In [8]:
def vectorized_lon(lats1, lats2, longs1, longs2):
    radius = 6371
    dlon=np.radians(longs2 - longs1)
    a = np.cos(np.radians(lats1)) * np.cos(np.radians(lats2)) * np.sin(dlon/2) * np.sin(dlon/2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = radius * c
    return d

In [9]:
lat = all_join['latitude_x'].values
match_lat = all_join['latitude_y'].values
lon = all_join['longitude_x'].values
match_lon = all_join['longitude_y'].values
all_join['dlon'] = vectorized_lon(lat, match_lat, lon, match_lon)
all_join['dlat'] = vectorized_lat(lat, match_lat, lon, match_lon)

In [10]:
all_join

Unnamed: 0,id_x,name_x,latitude_x,longitude_x,address_x,city_x,state_x,zip_x,country_x,url_x,...,state_y,zip_y,country_y,url_y,phone_y,categories_y,diff_lat,diff_lon,dlon,dlat
1,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700,BE,,...,,,BE,,,Bars,0.009383,0.001010,0.070858,1.043305
2,E_da7fa3963561f8,Café Oudenaarde,50.869358,3.635206,,,,,BE,,...,Oost-Vlaanderen,9700,BE,,,Bars,0.009383,0.001010,0.070858,1.043305
5,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,,BR,,...,RJ,20040-901,BR,,,"Bars, Snack Places",0.000200,0.000177,0.018098,0.022227
6,E_e80db432029aea,Carioca Manero,-22.907025,-43.178067,Shopping Avenida Central,Rio de Janeiro,RJ,20040-901,BR,,...,,,BR,,,Brazilian Restaurants,0.000200,0.000177,0.018098,0.022227
11,E_00001d92066153,Restaurante Casa Cofiño,43.338196,-4.326821,,Caviedes,Cantabria,,ES,,...,Spain,39593,ES,,34942708046,Spanish Restaurants,0.000066,0.000104,0.008382,0.007372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3037950,E_ffe4a72f474af1,Učňovské kadeřnictví a holičství,50.097211,14.464821,Molákova 578/36,Praha,Hlavní město Praha,186 00,CZ,http://www.souhair.cz/provozovny.html,...,,,CZ,,,,0.000325,0.000237,0.016932,0.036123
3038539,E_ff3ac500130de1,東急バケーションズ伊豆高原,34.896200,139.094079,,伊東市,静岡県,413-0234,JP,,...,静岡県,413-0234,JP,,,"Resorts, Hotels",0.000524,0.000089,0.008101,0.058217
3038540,E_ff6935b7173ed6,東急バケーションズ伊豆高原,34.895676,139.093991,池614-91,伊東市,静岡県,413-0234,JP,,...,静岡県,413-0234,JP,,,Resorts,0.000524,0.000089,0.008101,0.058217
3038764,E_ff5e2880b229ab,Братья Караваевы,55.744240,37.603006,Малый Сухаревский пер.7,Москва,,,RU,,...,Moscow,,RU,http://karavaevi.ru/enter/,,Bakeries,0.026291,0.022947,1.435748,2.923480


### 緯度、経度ベース

In [47]:
print('full: lat=', all_join['diff_lat'].mean(), 'lon=', all_join['diff_lon'].mean())
print('lat<5: lat=', all_join.query('abs(latitude_x) < 5')['diff_lat'].mean(), 'lon=', all_join.query('abs(latitude_x) < 5')['diff_lon'].mean())
print('lat5-10: lat=', all_join.query('abs(latitude_x) < 10 and abs(latitude_x) > 5')['diff_lat'].mean(), 'lon=', all_join.query('abs(latitude_x) < 10 and abs(latitude_x) > 5')['diff_lon'].mean())
print('lat10-20: lat=', all_join.query('abs(latitude_x) < 20 and abs(latitude_x) > 10')['diff_lat'].mean(), 'lon=', all_join.query('abs(latitude_x) < 20 and abs(latitude_x) > 10')['diff_lon'].mean())
print('lat30-40: lat=', all_join.query('abs(latitude_x) < 40 and abs(latitude_x) > 30')['diff_lat'].mean(), 'lon=', all_join.query('abs(latitude_x) < 40 and abs(latitude_x) > 30')['diff_lon'].mean())
print('lat40-60: lat=', all_join.query('abs(latitude_x) < 60 and abs(latitude_x) > 40')['diff_lat'].mean(), 'lon=', all_join.query('abs(latitude_x) < 60 and abs(latitude_x) > 40')['diff_lon'].mean())
print('lat60-: lat=', all_join.query('abs(latitude_x) > 60')['diff_lat'].mean(), 'lon=', all_join.query('abs(latitude_x) > 60')['diff_lon'].mean())

full: lat= 0.7997279233652254 lon= 1.6602567264153405
lat<5: lat= 1.718177569519094 lon= 2.53938476124143
lat5-10: lat= 0.9174846712170145 lon= 1.523440098537588
lat10-20: lat= 0.38209918989309555 lon= 0.49837086346911647
lat30-40: lat= 0.3543014012628672 lon= 0.7679053503999849
lat40-60: lat= 0.8290264529915744 lon= 2.469199935502229
lat60-: lat= 0.8803260852735523 lon= 2.514815181845604


### 実際の距離ベース

In [57]:
print('full: lat=', all_join['dlat'].mean(), 'lon=', all_join['dlon'].mean(), all_join['dlon'].mean()/all_join['dlat'].mean())
print('lat<5: lat=', all_join.query('abs(latitude_x) < 5')['dlat'].mean(), 'lon=', all_join.query('abs(latitude_x) < 5')['dlon'].mean(), all_join.query('abs(latitude_x) < 5')['dlon'].mean()/all_join.query('abs(latitude_x) < 5')['dlat'].mean())
print('lat5-10: lat=', all_join.query('abs(latitude_x) < 10 and abs(latitude_x) > 5')['dlat'].mean(), 'lon=', all_join.query('abs(latitude_x) < 10 and abs(latitude_x) > 5')['dlon'].mean(), all_join.query('abs(latitude_x) < 10 and abs(latitude_x) > 5')['dlon'].mean()/all_join.query('abs(latitude_x) < 10 and abs(latitude_x) > 5')['dlat'].mean())
print('lat10-20: lat=', all_join.query('abs(latitude_x) < 20 and abs(latitude_x) > 10')['dlat'].mean(), 'lon=', all_join.query('abs(latitude_x) < 20 and abs(latitude_x) > 10')['dlon'].mean(), all_join.query('abs(latitude_x) < 20 and abs(latitude_x) > 10')['dlon'].mean()/all_join.query('abs(latitude_x) < 20 and abs(latitude_x) > 10')['dlat'].mean())
print('lat20-30: lat=', all_join.query('abs(latitude_x) < 30 and abs(latitude_x) > 20')['dlat'].mean(), 'lon=', all_join.query('abs(latitude_x) < 30 and abs(latitude_x) > 20')['dlon'].mean(), all_join.query('abs(latitude_x) < 30 and abs(latitude_x) > 20')['dlon'].mean()/all_join.query('abs(latitude_x) < 30 and abs(latitude_x) > 20')['dlat'].mean())
print('lat30-40: lat=', all_join.query('abs(latitude_x) < 40 and abs(latitude_x) > 30')['dlat'].mean(), 'lon=', all_join.query('abs(latitude_x) < 40 and abs(latitude_x) > 30')['dlon'].mean(), all_join.query('abs(latitude_x) < 40 and abs(latitude_x) > 30')['dlon'].mean()/all_join.query('abs(latitude_x) < 40 and abs(latitude_x) > 30')['dlat'].mean())
print('lat40-60: lat=', all_join.query('abs(latitude_x) < 60 and abs(latitude_x) > 40')['dlat'].mean(), 'lon=', all_join.query('abs(latitude_x) < 60 and abs(latitude_x) > 40')['dlon'].mean(), all_join.query('abs(latitude_x) < 60 and abs(latitude_x) > 40')['dlon'].mean()/all_join.query('abs(latitude_x) < 60 and abs(latitude_x) > 40')['dlat'].mean())
print('lat60-: lat=', all_join.query('abs(latitude_x) > 60')['dlat'].mean(), 'lon=', all_join.query('abs(latitude_x) > 60')['dlon'].mean(), all_join.query('abs(latitude_x) > 60')['dlon'].mean()/all_join.query('abs(latitude_x) > 60')['dlat'].mean())

full: lat= 88.9256877742016 lon= 138.01056477340907 1.5519763549520453
lat<5: lat= 191.05262880500183 lon= 256.82227396224977 1.3442488363998164
lat5-10: lat= 102.019640713483 lon= 149.51434160441946 1.465544679032177
lat10-20: lat= 42.48749139110809 lon= 44.148342799809384 1.0390903617587761
lat20-30: lat= 91.37446446561786 lon= 106.54552366347535 1.166031716700961
lat30-40: lat= 39.39651832348887 lon= 61.26148142605389 1.5549973457814104
lat40-60: lat= 92.18353562679691 lon= 173.65831192495335 1.8838321913362637
lat60-: lat= 97.8877944752842 lon= 134.7228797302059 1.3762990621289588


In [64]:
# 北半球
print('full: lat=', all_join['dlat'].mean(), 'lon=', all_join['dlon'].mean(), all_join['dlon'].mean()/all_join['dlat'].mean())
print('lat<5: lat=', all_join.query('latitude_x < 5 and latitude_x > 0')['dlat'].mean(), 'lon=', all_join.query('latitude_x < 5 and latitude_x > 0')['dlon'].mean(), all_join.query('latitude_x < 5 and latitude_x > 0')['dlon'].mean()/all_join.query('latitude_x < 5 and latitude_x > 0')['dlat'].mean())
print('lat5-10: lat=', all_join.query('latitude_x < 10 and latitude_x > 5')['dlat'].mean(), 'lon=', all_join.query('latitude_x < 10 and latitude_x > 5')['dlon'].mean(), all_join.query('latitude_x < 10 and latitude_x > 5')['dlon'].mean()/all_join.query('latitude_x < 10 and latitude_x > 5')['dlat'].mean())
print('lat10-20: lat=', all_join.query('latitude_x < 20 and latitude_x > 10')['dlat'].mean(), 'lon=', all_join.query('latitude_x < 20 and latitude_x > 10')['dlon'].mean(), all_join.query('latitude_x < 20 and latitude_x > 10')['dlon'].mean()/all_join.query('latitude_x < 20 and latitude_x > 10')['dlat'].mean())
print('lat20-30: lat=', all_join.query('latitude_x < 30 and latitude_x > 20')['dlat'].mean(), 'lon=', all_join.query('latitude_x < 30 and latitude_x > 20')['dlon'].mean(), all_join.query('latitude_x < 30 and latitude_x > 20')['dlon'].mean()/all_join.query('latitude_x < 30 and latitude_x > 20')['dlat'].mean())
print('lat30-40: lat=', all_join.query('latitude_x < 40 and latitude_x > 30')['dlat'].mean(), 'lon=', all_join.query('latitude_x < 40 and latitude_x > 30')['dlon'].mean(), all_join.query('latitude_x < 40 and latitude_x > 30')['dlon'].mean()/all_join.query('latitude_x < 40 and latitude_x > 30')['dlat'].mean())
print('lat40-60: lat=', all_join.query('latitude_x < 60 and latitude_x > 40')['dlat'].mean(), 'lon=', all_join.query('latitude_x < 60 and latitude_x > 40')['dlon'].mean(), all_join.query('latitude_x < 60 and latitude_x > 40')['dlon'].mean()/all_join.query('latitude_x < 60 and latitude_x > 40')['dlat'].mean())
print('lat60-: lat=', all_join.query('latitude_x > 60')['dlat'].mean(), 'lon=', all_join.query('latitude_x > 60')['dlon'].mean(), all_join.query('latitude_x > 60')['dlon'].mean()/all_join.query('latitude_x > 60')['dlat'].mean())

full: lat= 88.9256877742016 lon= 138.01056477340907 1.5519763549520453
lat<5: lat= 154.55909424758448 lon= 184.35599766541642 1.1927864779674562
lat5-10: lat= 183.34783570049177 lon= 139.96870883222854 0.763405296263626
lat10-20: lat= 36.25100264011122 lon= 28.983494795738732 0.7995225699955926
lat20-30: lat= 108.14274154137053 lon= 127.8844233535138 1.182552074515246
lat30-40: lat= 34.435854098031506 lon= 57.1431771706552 1.659409318205984
lat40-60: lat= 92.21084242357979 lon= 173.89386209291837 1.8858287975954
lat60-: lat= 90.79373119433288 lon= 132.32978750981482 1.457477138224214


In [30]:
171.374940/120.157297

1.42625495312199