# exp032_trial
処理変更部の確認

In [1]:
# Execution-mode switch. Later cells branch on this value to pick imports and
# directories: 'local_train' for a local run, 'kaggle_inference' for the Kaggle kernel.
MODE = 'local_train'
#MODE = 'kaggle_inference'

In [2]:
# Experiment identifier; the 'kaggle_inference' branch of the directory setup
# interpolates it into the model directory path.
exp_name = 'exp032'
# Free-form note describing this experiment (the Japanese text reads "1st improvement").
memo = '1st改善'

In [3]:
import os
import sys
import gc

if MODE == 'local_train':
    # Local runs pick up extra packages from a user-level site-packages directory.
    sys.path.append('/home/kaggler/.local/lib/python3.8/site-packages')
    from dotenv import load_dotenv
    # load_dotenv must be *called*: a bare `load_dotenv` is a no-op expression,
    # so the .env file was never read and os.getenv() only saw variables that
    # happened to be exported in the environment already. With its default
    # settings load_dotenv() does not override variables that are already set.
    load_dotenv()
    utils_path = os.getenv('UTILS_PATH')
    if utils_path:
        # Presumably the directory holding line_notify / slack_notify — verify.
        sys.path.append(utils_path)
    import line_notify
    import slack_notify
    
if MODE == "kaggle_inference":
    from cuml import ForestInference
    import treelite
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt; plt.style.use("ggplot")
import seaborn as sns
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb
import itertools
from scipy.spatial.distance import canberra
from sklearn.neighbors import KNeighborsRegressor
import functools
import multiprocessing
import Levenshtein
import difflib
import pickle
from tqdm import tqdm
%load_ext Cython

from transformers import DistilBertModel, DistilBertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.nn as nn
import torch

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from cuml.feature_extraction.text import TfidfVectorizer as TfidfVectorizer_gpu
import cudf, cuml, cupy
from cuml.neighbors import NearestNeighbors as NearestNeighbors_gpu

In [4]:
# directory_setting
# Paths and the BERT checkpoint are chosen per MODE. If MODE matches neither
# branch, none of these names are defined.
if MODE == 'local_train':
    # Locations come from environment variables (os.getenv returns None when unset).
    INPUT_DIR = os.getenv('INPUT_DIR')
    OUTPUT_DIR = os.getenv('OUTPUT_DIR')
    # NOTE(review): MODEL_DIR reads the same variable as OUTPUT_DIR — confirm this is intended.
    MODEL_DIR = os.getenv('OUTPUT_DIR')
    BERT_MODEL = "distilbert-base-multilingual-cased"
    #os.makedirs(OUTPUT_DIR + exp_name, exist_ok=True)

elif MODE == 'kaggle_inference':
    INPUT_DIR = '/kaggle/input/foursquare-location-matching/'
    OUTPUT_DIR = './'
    MODEL_DIR = f'../input/fs{exp_name}/'
    # NOTE(review): this path name does not match the multilingual-cased checkpoint
    # used in local_train — verify that both modes load the same model.
    BERT_MODEL = "../input/distilbertbaseuncased"

In [5]:
# CONFIG
# Only N_NEIGHBORS is referenced by the cells visible in this part of the
# notebook; the meanings noted for the other constants are assumptions.
SEED = 42          # presumably the random seed — not referenced in the visible cells
# Number of neighbours requested per row by the candidate extractors
# (capped at the country-group size). Re-assigned again before extraction.
N_NEIGHBORS = 10
N_SPLITS = 5       # NOTE(review): the visible GroupKFold call hard-codes n_splits=2 instead
PROB_TH = 0.5      # presumably a probability threshold — TODO confirm
MAX_LEN = 32       # presumably the tokenizer max length — TODO confirm
BS = 512           # presumably the batch size — TODO confirm
NW = 2             # presumably the number of DataLoader workers — TODO confirm
SVD_N_COMP = 50    # presumably n_components for TruncatedSVD — TODO confirm

In [6]:
def preprocess(df):
    """Normalise text columns and convert coordinates to radians.

    Each text attribute (everything in the column list except ``id``) is cast
    to ``str`` and lower-cased, so missing values become plain strings as well.
    ``latitude`` and ``longitude`` are passed through ``np.deg2rad``.

    The frame is modified in place and the same object is returned.
    """
    text_columns = ('name', 'address', 'city', 'state', 'zip',
                    'country', 'url', 'phone', 'categories')
    for text_col in text_columns:
        lowered = df[text_col].astype(str).str.lower()
        df[text_col] = lowered

    coord_cols = ["latitude", "longitude"]
    df[coord_cols] = np.deg2rad(df[coord_cols])

    return df

In [7]:
def extract_candidate_dist(df):
    """Build candidate pairs from geographic proximity, country by country.

    Within each ``country`` group a k-nearest-neighbour model with the
    haversine metric is fitted on ``(latitude, longitude)`` and queried with
    the same rows. ``k`` is the module-level ``N_NEIGHBORS`` capped at the
    group size. Each row is then expanded to one row per neighbour, with the
    neighbour's id in ``match_id``; rows paired with themselves are dropped.

    Returns the concatenation over all countries with a fresh 0..n-1 index.
    """
    coord_cols = ['latitude', 'longitude']
    per_country = []
    for _, group in tqdm(df.groupby("country")):
        group = group.reset_index(drop=True)
        n_near = min(len(group), N_NEIGHBORS)

        # The regressor is only used as a neighbour index; the target passed
        # to fit() is just the positional index.
        neigh = KNeighborsRegressor(n_neighbors=n_near,
                                    metric='haversine', n_jobs=-1)
        neigh.fit(group[coord_cols], group.index)
        near_idx = neigh.kneighbors(group[coord_cols], return_distance=False)

        # Translate neighbour positions into ids, one list per row.
        group['match_id'] = group['id'].values[near_idx[:, :n_near]].tolist()
        pairs = group.explode(['match_id'])
        not_self = pairs['id'] != pairs['match_id']
        per_country.append(pairs.loc[not_self].copy())

    return pd.concat(per_country).reset_index(drop=True)

In [13]:
def extract_candidate_tfidf_sim(df, col):
    """Build candidate pairs from TF-IDF similarity of a text column.

    Within each ``country`` group the values of ``col`` are vectorised with
    the GPU TF-IDF vectoriser (English stop words, binary term counts) and a
    brute-force GPU nearest-neighbour search is run over those vectors.
    The neighbour count is the module-level ``N_NEIGHBORS`` capped at the
    group size. Each row is expanded to one row per neighbour, with the
    neighbour's id in ``match_id``; rows paired with themselves are dropped.

    Returns the concatenation over all countries with a fresh 0..n-1 index.
    """
    per_country = []
    for _, group in tqdm(df.groupby("country")):
        group = group.reset_index(drop=True)
        n_near = min(len(group), N_NEIGHBORS)

        vectorizer = TfidfVectorizer_gpu(stop_words='english', binary=True)
        text_embeddings = vectorizer.fit_transform(cudf.from_pandas(group[col]))

        nn_index = NearestNeighbors_gpu(n_neighbors=n_near, algorithm="brute")
        nn_index.fit(text_embeddings)
        near_idx = nn_index.kneighbors(text_embeddings, return_distance=False)

        # .get() copies the neighbour positions back to host memory before
        # they are used to index the id array.
        group['match_id'] = group['id'].values[near_idx[:, :n_near].get()].tolist()
        pairs = group.explode(['match_id'])
        not_self = pairs['id'] != pairs['match_id']
        per_country.append(pairs.loc[not_self].copy())

    return pd.concat(per_country).reset_index(drop=True)

In [9]:
def add_orgin_data(df, org_df):
    """Attach the matched record's attributes to every candidate pair.

    Every column of ``org_df`` is renamed with a ``match_`` prefix (so its
    ``id`` becomes ``match_id``) and joined onto ``df`` on ``match_id`` using
    the default merge mode. The result gets a fresh 0..n-1 index.
    """
    match_side = org_df.add_prefix('match_')
    merged = df.merge(match_side, on='match_id')
    return merged.reset_index(drop=True)

In [15]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Great-circle central angle between two sets of points on a sphere.

    All coordinates must be given in **radians**: this function performs no
    degree-to-radian conversion. The result is the central angle in radians;
    multiply by the sphere's radius to obtain a distance.

    Arguments may be scalars, NumPy arrays or pandas Series; array-like
    arguments must be of equal length (or broadcastable).
    """
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return c

In [10]:
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Return a dict mapping each value of ``id`` to the row's ``point_of_interest``."""
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {record_id: poi for record_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Return a dict mapping each ``point_of_interest`` to the set of ``id`` values in it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_by_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df: pd.DataFrame, org_data):
    """Mean intersection-over-union between predicted and true match sets.

    ``org_data`` supplies the ground truth (``id`` -> ``point_of_interest``).
    For every row of ``input_df`` the whitespace-separated ids in ``matches``
    are compared with the set of ids sharing that row's point of interest,
    and the per-row ``|intersection| / |union|`` values are averaged.
    """
    id2poi = get_id2poi(org_data)
    poi2ids = get_poi2ids(org_data)

    def _row_iou(id_str, matches):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        return len(targets & preds) / len(targets | preds)

    row_pairs = zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy())
    ious = np.array([_row_iou(id_str, matches) for id_str, matches in row_pairs])
    return ious.mean()

def calc_max_score(tr_data, org_data):
    """Score the candidate set as if every true pair in it were predicted.

    Starting from each id matching itself, the ``match_id`` values of all
    candidate pairs in ``tr_data`` whose two sides share a
    ``point_of_interest`` are appended to that id's match string. The
    resulting frame is evaluated with ``get_score``; the score is printed
    and returned.
    """
    upper_bound = pd.DataFrame()
    for column in ('id', 'matches'):
        # Every id starts out matching itself.
        upper_bound[column] = org_data['id'].unique()

    same_poi = tr_data['point_of_interest'] == tr_data['match_point_of_interest']
    true_pairs = tr_data.loc[same_poi]
    joined = true_pairs.groupby('id')['match_id'].apply(list).map(" ".join)
    true_matches = joined.reset_index()
    true_matches.columns = ['id', 'candidates']

    upper_bound = upper_bound.merge(true_matches, on='id', how='left')
    # Ids without any true candidate keep just their self-match.
    has_candidates = upper_bound['candidates'].notna()
    upper_bound.loc[has_candidates, "matches"] += " " + upper_bound.loc[has_candidates, "candidates"]

    score = get_score(upper_bound, org_data)
    print('1st_stage_max_score : ' + '{:.5f}'.format(score))
    return score

In [11]:
# Load the training CSV and normalise it with preprocess()
# (lower-cased text columns, coordinates converted to radians).
train_origin = pd.read_csv(INPUT_DIR + "train.csv")
train_origin = preprocess(train_origin)

# Split the training data into two folds grouped by point_of_interest, so all
# records of one POI land in the same fold; "set" stores the fold number.
kf = GroupKFold(n_splits=2)
for i, (trn_idx, val_idx) in enumerate(kf.split(train_origin, train_origin['point_of_interest'], train_origin['point_of_interest'])):
    train_origin.loc[val_idx, "set"] = i

In [16]:
# Label for this candidate-generation setup (not referenced again in the visible cells).
condition = "dist10 + namesim10"
# Re-assigns the module-level N_NEIGHBORS that both extractors read.
N_NEIGHBORS = 10
# Candidates are generated on fold 0 only: once from coordinate proximity and
# once from TF-IDF similarity of the "name" column.
dist_df = extract_candidate_dist(train_origin[train_origin["set"]==0])
namesim_df = extract_candidate_tfidf_sim(train_origin[train_origin["set"]==0], "name")

100%|██████████| 210/210 [00:38<00:00,  5.47it/s]
100%|██████████| 210/210 [00:39<00:00,  5.26it/s]


In [17]:
# Inspect the distance-based candidate pairs (one row per id / match_id pair).
dist_df

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest,set,match_id
0,E_1104d3664585a4,mamá maría,0.741907,0.026620,"av. meritxell, 25",andorra la vella,parròquia d'andorra la vella,,ad,,,"italian restaurants, pizza places",P_a12b9416902417,0.0,E_be8ec27bf1b71b
1,E_1104d3664585a4,mamá maría,0.741907,0.026620,"av. meritxell, 25",andorra la vella,parròquia d'andorra la vella,,ad,,,"italian restaurants, pizza places",P_a12b9416902417,0.0,E_5ccc58e81a3c11
2,E_1104d3664585a4,mamá maría,0.741907,0.026620,"av. meritxell, 25",andorra la vella,parròquia d'andorra la vella,,ad,,,"italian restaurants, pizza places",P_a12b9416902417,0.0,E_bc8a2b3674875b
3,E_1104d3664585a4,mamá maría,0.741907,0.026620,"av. meritxell, 25",andorra la vella,parròquia d'andorra la vella,,ad,,,"italian restaurants, pizza places",P_a12b9416902417,0.0,E_4a34a5012e88c3
4,E_1104d3664585a4,mamá maría,0.741907,0.026620,"av. meritxell, 25",andorra la vella,parròquia d'andorra la vella,,ad,,,"italian restaurants, pizza places",P_a12b9416902417,0.0,E_c93b28d5249c89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5124281,E_6d33113f66fd27,holiday inn,-0.351868,0.499320,p.o. box ac88,bulawayo,,,zw,http://ihg.co/buqwo4s,9252464,hotels,P_59885d1433ff74,0.0,E_f490b8a74a551e
5124282,E_8c77abc0296481,holiday inn,-0.351472,0.498970,p.o. box ac88,bulawayo,,,zw,http://ihg.co/buqwo4s,9252464,hotels,P_59885d1433ff74,0.0,E_6d33113f66fd27
5124283,E_8c77abc0296481,holiday inn,-0.351472,0.498970,p.o. box ac88,bulawayo,,,zw,http://ihg.co/buqwo4s,9252464,hotels,P_59885d1433ff74,0.0,E_f490b8a74a551e
5124284,E_f490b8a74a551e,fanoos electronics,-0.311186,0.541896,,,,,zw,,,electronics stores,P_fb45f045b467b7,0.0,E_8c77abc0296481


In [18]:
# Inspect the name-similarity candidate pairs (one row per id / match_id pair).
namesim_df

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest,set,match_id
0,E_1104d3664585a4,mamá maría,0.741907,0.026620,"av. meritxell, 25",andorra la vella,parròquia d'andorra la vella,,ad,,,"italian restaurants, pizza places",P_a12b9416902417,0.0,E_be8ec27bf1b71b
1,E_1104d3664585a4,mamá maría,0.741907,0.026620,"av. meritxell, 25",andorra la vella,parròquia d'andorra la vella,,ad,,,"italian restaurants, pizza places",P_a12b9416902417,0.0,E_20b7920b07557d
2,E_1104d3664585a4,mamá maría,0.741907,0.026620,"av. meritxell, 25",andorra la vella,parròquia d'andorra la vella,,ad,,,"italian restaurants, pizza places",P_a12b9416902417,0.0,E_9acdcda349c154
3,E_1104d3664585a4,mamá maría,0.741907,0.026620,"av. meritxell, 25",andorra la vella,parròquia d'andorra la vella,,ad,,,"italian restaurants, pizza places",P_a12b9416902417,0.0,E_dfa69a18228406
4,E_1104d3664585a4,mamá maría,0.741907,0.026620,"av. meritxell, 25",andorra la vella,parròquia d'andorra la vella,,ad,,,"italian restaurants, pizza places",P_a12b9416902417,0.0,E_c93b28d5249c89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5169546,E_6d33113f66fd27,holiday inn,-0.351868,0.499320,p.o. box ac88,bulawayo,,,zw,http://ihg.co/buqwo4s,9252464,hotels,P_59885d1433ff74,0.0,E_f490b8a74a551e
5169547,E_8c77abc0296481,holiday inn,-0.351472,0.498970,p.o. box ac88,bulawayo,,,zw,http://ihg.co/buqwo4s,9252464,hotels,P_59885d1433ff74,0.0,E_6d33113f66fd27
5169548,E_8c77abc0296481,holiday inn,-0.351472,0.498970,p.o. box ac88,bulawayo,,,zw,http://ihg.co/buqwo4s,9252464,hotels,P_59885d1433ff74,0.0,E_f490b8a74a551e
5169549,E_f490b8a74a551e,fanoos electronics,-0.311186,0.541896,,,,,zw,,,electronics stores,P_fb45f045b467b7,0.0,E_6d33113f66fd27


In [19]:
# Stack both candidate sets. Nothing is de-duplicated here, so a pair found by
# both extractors appears twice.
train = pd.concat([dist_df, namesim_df])

In [20]:
# Join the candidate record's own attributes from train_origin as match_* columns.
train = add_orgin_data(train, train_origin)

In [21]:
# Central angle between each record and its candidate, as returned by haversine_np.
# NOTE(review): the column name is spelled "habersine_dist" (not "haversine");
# renaming it would also require updating any downstream reader.
train["habersine_dist"] = haversine_np(train["longitude"], train["latitude"], train["match_longitude"], train["match_latitude"])

In [23]:
# Inspect the phone number of each record next to its candidate's phone number.
train[["phone", "match_phone"]]

Unnamed: 0,phone,match_phone
0,,+376869996
1,871787,+376869996
2,,+376869996
3,+376753191,+376869996
4,737350,+376869996
...,...,...
10293832,,
10293833,119338000,
10293834,114727430,
10293835,,


In [25]:
# NOTE(review): no visible cell creates a "d_near" column on dist_df; this cell
# appears to rely on kernel state from a cell that is not shown. Confirm it
# still works after Restart & Run All.
dist_df["d_near"]

0          0.000109
1          0.002263
2           0.00126
3          0.001542
4          0.000765
             ...   
5124281    0.057186
5124282     0.05723
5124283    0.057186
5124284    0.000514
5124285     0.05723
Name: d_near, Length: 5124286, dtype: object

In [22]:
# NOTE(review): haversine_distances is already imported in the notebook's import
# cell and is not used in this cell.
from sklearn.metrics.pairwise import haversine_distances
# Recompute the record-to-candidate angle with haversine_np for comparison.
# NOTE(review): the dist_df preview earlier in the notebook shows no
# match_longitude / match_latitude columns, so this presumably ran against a
# dist_df built by a cell that is not shown — confirm.
haversine_np(dist_df["longitude"], dist_df["latitude"], dist_df["match_longitude"], dist_df["match_latitude"])

0          0.000109
1          0.002263
2          0.001260
3          0.001542
4          0.000765
             ...   
5124281    0.057186
5124282    0.057230
5124283    0.057186
5124284    0.000514
5124285    0.057230
Length: 5124286, dtype: float64