# exp032_trial

In [1]:
# Execution mode switch: later cells branch on 'local_train' vs 'kaggle_inference'
# to choose imports and directories.
MODE = 'local_train'
#MODE = 'kaggle_inference'

In [2]:
# Experiment identifier; the 'kaggle_inference' branch of the directory cell
# embeds it in MODEL_DIR.
exp_name = 'exp032'
# Free-form note about this experiment (not referenced in the visible cells).
memo = '1st改善'

In [3]:
import os
import sys
import gc

if MODE == 'local_train':
    # Make user-site packages importable in the local container.
    sys.path.append('/home/kaggler/.local/lib/python3.8/site-packages')
    from dotenv import load_dotenv
    # Actually load the .env file: the bare name `load_dotenv` was a no-op
    # expression, so variables such as UTILS_PATH were never read from .env.
    # Already-set environment variables are not overridden by default.
    load_dotenv()
    utils_path = os.getenv('UTILS_PATH')
    if utils_path is None:
        # Fail early with a clear message instead of appending None to sys.path.
        raise EnvironmentError("UTILS_PATH is not set (expected in the environment or .env)")
    sys.path.append(utils_path)
    import line_notify
    import slack_notify
    
if MODE == "kaggle_inference":
    from cuml import ForestInference
    import treelite
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt; plt.style.use("ggplot")
import seaborn as sns
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb
import itertools
from scipy.spatial.distance import canberra
from sklearn.neighbors import KNeighborsRegressor
import functools
import multiprocessing
import Levenshtein
import difflib
import pickle
from tqdm import tqdm
%load_ext Cython

from transformers import DistilBertModel, DistilBertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.nn as nn
import torch

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Directory settings: where input data is read from and where artifacts live.
if MODE == 'local_train':
    # Local paths come from environment variables (os.getenv returns None if unset).
    INPUT_DIR = os.getenv('INPUT_DIR')
    OUTPUT_DIR = os.getenv('OUTPUT_DIR')
    # NOTE(review): MODEL_DIR reads the same OUTPUT_DIR variable as above — confirm this is intended.
    MODEL_DIR = os.getenv('OUTPUT_DIR')
    BERT_MODEL = "distilbert-base-multilingual-cased"
    #os.makedirs(OUTPUT_DIR + exp_name, exist_ok=True)

elif MODE == 'kaggle_inference':
    INPUT_DIR = '/kaggle/input/foursquare-location-matching/'
    OUTPUT_DIR = './'
    # Model artifacts are looked up in a dataset directory named after the experiment.
    MODEL_DIR = f'../input/fs{exp_name}/'
    # NOTE(review): this path name suggests a different checkpoint than the
    # multilingual-cased model used locally — confirm they are meant to differ.
    BERT_MODEL = "../input/distilbertbaseuncased"

In [5]:
# CONFIG
# Only N_NEIGHBORS is read by the visible cells; the other constants are
# presumably consumed by later cells — TODO confirm.
SEED = 42
# Number of neighbours requested per record by the candidate extractors;
# the experiment cells below reassign it before each run.
N_NEIGHBORS = 10
N_SPLITS = 5
PROB_TH = 0.5
MAX_LEN = 32
BS = 512
NW = 2
SVD_N_COMP = 50

In [6]:
def preprocess(df):
    """Normalise text columns and convert coordinates to radians, in place.

    Every text attribute (all listed columns except ``id``) is cast to ``str``
    and lower-cased, so missing values become the literal string ``'nan'``.
    ``latitude`` and ``longitude`` are converted from degrees to radians.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text columns listed below plus ``latitude`` and
        ``longitude``. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.
    """
    text_columns = ('name', 'address', 'city', 'state',
                    'zip', 'country', 'url', 'phone', 'categories')
    for text_col in text_columns:
        as_text = df[text_col].astype(str)
        df[text_col] = as_text.str.lower()

    geo_columns = ["latitude", "longitude"]
    df[geo_columns] = np.deg2rad(df[geo_columns])

    return df

In [7]:
from cuml.feature_extraction.text import TfidfVectorizer as TfidfVectorizer_gpu
import cudf, cuml, cupy
from cuml.neighbors import NearestNeighbors as NearestNeighbors_gpu

In [8]:
def extract_candidate(df, n_neighbors=None):
    """Build candidate pairs from geographic nearest neighbours, per country.

    For each ``country`` group, a haversine k-nearest-neighbour search is run
    on (``latitude``, ``longitude``) and every record is paired with its
    neighbours. The haversine metric expects coordinates in radians, which is
    what ``preprocess`` produces.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with at least ``id``, ``country``, ``latitude`` and
        ``longitude`` columns.
    n_neighbors : int, optional
        Number of neighbours to request per record (the record itself is
        usually among them and is removed afterwards). When ``None`` the
        module-level ``N_NEIGHBORS`` is read at call time, which preserves the
        original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per (``id``, ``match_id``) candidate pair with the original
        columns plus ``match_id`` and ``d_near`` (the neighbour distance
        returned by the search). Rows where ``match_id == id`` are dropped.
    """
    if n_neighbors is None:
        n_neighbors = N_NEIGHBORS

    dfs = []
    for _, country_df in tqdm(df.groupby("country")):
        country_df = country_df.reset_index(drop=True)

        # A country with fewer rows than requested can only supply len(country_df) neighbours.
        k = min(len(country_df), n_neighbors)
        coords = country_df[['latitude', 'longitude']]

        # The regressor is used purely as a neighbour index; the fitted
        # target (the row index) is never predicted.
        knn = KNeighborsRegressor(n_neighbors=k, metric='haversine', n_jobs=-1)
        knn.fit(coords, country_df.index)
        dists, nears = knn.kneighbors(coords, return_distance=True)

        # nears holds positional row indices; map them to record ids.
        country_df['match_id'] = country_df['id'].values[nears].tolist()
        country_df['d_near'] = dists.tolist()
        # One row per (record, neighbour) pair.
        country_df = country_df.explode(['match_id', 'd_near'])
        # Drop self-pairs.
        country_df = country_df.loc[country_df['id'] != country_df['match_id']].copy()
        dfs.append(country_df)
    df = pd.concat(dfs).reset_index(drop=True)
    return df

In [9]:
def extract_candidate_tfidf_name(df, n_neighbors=None):
    """Build candidate pairs from name similarity (TF-IDF + kNN), per country.

    For each ``country`` group, the ``name`` column is vectorised on the GPU
    with a binary TF-IDF (English stop words removed) and a brute-force
    nearest-neighbour search pairs every record with its closest names.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with at least ``id``, ``country`` and ``name`` columns.
    n_neighbors : int, optional
        Number of neighbours to request per record. When ``None`` the
        module-level ``N_NEIGHBORS`` is read at call time, which preserves the
        original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per (``id``, ``match_id``) candidate pair with the original
        columns plus ``match_id`` and ``d_near`` (the neighbour distance
        returned by the search). Rows where ``match_id == id`` are dropped.
    """
    if n_neighbors is None:
        n_neighbors = N_NEIGHBORS

    dfs = []
    for _, country_df in tqdm(df.groupby("country")):
        country_df = country_df.reset_index(drop=True)

        # A country with fewer rows than requested can only supply len(country_df) neighbours.
        k = min(len(country_df), n_neighbors)

        vectorizer = TfidfVectorizer_gpu(stop_words='english', binary=True)
        text_embeddings = vectorizer.fit_transform(cudf.from_pandas(country_df["name"]))

        neighbor_index = NearestNeighbors_gpu(n_neighbors=k, algorithm="brute")
        neighbor_index.fit(text_embeddings)
        dists, nears = neighbor_index.kneighbors(text_embeddings)

        # .get() copies the device-side index array to the host so it can
        # index the host-side id array.
        country_df['match_id'] = country_df['id'].values[nears.get()].tolist()
        country_df['d_near'] = dists.tolist()
        # One row per (record, neighbour) pair.
        country_df = country_df.explode(['match_id', 'd_near'])
        # Drop self-pairs.
        country_df = country_df.loc[country_df['id'] != country_df['match_id']].copy()
        dfs.append(country_df)
    df = pd.concat(dfs).reset_index(drop=True)
    return df

In [10]:
def add_orgin_data(df, org_df):
    """Attach the matched record's attributes to each candidate pair.

    Every column of ``org_df`` is renamed with a ``match_`` prefix (so ``id``
    becomes ``match_id``) and inner-joined onto ``df`` by ``match_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate pairs containing a ``match_id`` column.
    org_df : pandas.DataFrame
        Original records containing an ``id`` column.

    Returns
    -------
    pandas.DataFrame
        The joined frame with a fresh 0..n-1 index.
    """
    match_side = org_df.add_prefix('match_')
    joined = df.merge(match_side, on='match_id')
    return joined.reset_index(drop=True)

In [11]:
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map each record ``id`` to its ``point_of_interest`` label."""
    record_ids = input_df['id']
    poi_labels = input_df['point_of_interest']
    return {record_id: poi for record_id, poi in zip(record_ids, poi_labels)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map each ``point_of_interest`` label to the set of record ids sharing it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    return {poi: set(member_ids) for poi, member_ids in ids_by_poi}

def get_score(input_df: pd.DataFrame, org_data):
    """Mean intersection-over-union between predicted and true match sets.

    Parameters
    ----------
    input_df : pandas.DataFrame
        One row per record with ``id`` and ``matches``, where ``matches`` is a
        whitespace-separated string of predicted ids.
    org_data : pandas.DataFrame
        Ground truth with ``id`` and ``point_of_interest`` columns; every id
        in ``input_df`` must appear here.

    Returns
    -------
    float
        Mean of the per-row IoU scores.
    """
    id2poi = get_id2poi(org_data)
    poi2ids = get_poi2ids(org_data)

    def _iou(record_id, match_str):
        # True set: every id sharing this record's point of interest.
        truth = poi2ids[id2poi[record_id]]
        predicted = set(match_str.split())
        return len(truth & predicted) / len(truth | predicted)

    row_pairs = zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy())
    per_row = [_iou(record_id, match_str) for record_id, match_str in row_pairs]
    return np.array(per_row).mean()

def calc_max_score(tr_data, org_data):
    """Compute and print the best score reachable with the given candidates.

    Each id in ``org_data`` is predicted as itself plus every candidate in
    ``tr_data`` that truly shares its point of interest, i.e. the result a
    perfect second-stage classifier would give. Ids with no true candidate
    keep only themselves as the prediction.

    Parameters
    ----------
    tr_data : pandas.DataFrame
        Candidate pairs with ``id``, ``match_id``, ``point_of_interest`` and
        ``match_point_of_interest`` columns.
    org_data : pandas.DataFrame
        Ground truth with ``id`` and ``point_of_interest`` columns.

    Returns
    -------
    float
        The score from ``get_score``; it is also printed with 5 decimals.
    """
    unique_ids = org_data['id'].unique()
    # Every record starts out matching itself.
    oracle = pd.DataFrame({'id': unique_ids, 'matches': unique_ids})

    # Keep only candidate pairs that are genuine matches.
    is_true_pair = tr_data['point_of_interest'] == tr_data['match_point_of_interest']
    true_pairs = tr_data.loc[is_true_pair]
    id_lists = true_pairs.groupby('id')['match_id'].apply(list)
    joined = id_lists.map(" ".join).reset_index()
    joined.columns = ['id', 'candidates']

    oracle = oracle.merge(joined, how='left', on='id')
    has_candidates = oracle['candidates'].notna()
    oracle.loc[has_candidates, "matches"] = (
        oracle.loc[has_candidates, "matches"] + " " + oracle.loc[has_candidates, "candidates"]
    )

    score = get_score(oracle, org_data)
    print(f'1st_stage_max_score : {score:.5f}')
    return score

In [12]:
# Load the training data and normalise it with preprocess().
# The path is built by plain concatenation, so INPUT_DIR must end with a separator.
train_origin = pd.read_csv(INPUT_DIR + "train.csv")
train_origin = preprocess(train_origin)

# Split the train data into two folds grouped by point_of_interest, so that
# all records of one POI land in the same fold.
kf = GroupKFold(n_splits=2)
for i, (trn_idx, val_idx) in enumerate(kf.split(train_origin, train_origin['point_of_interest'], train_origin['point_of_interest'])):
    # Label each row with the index of the fold in which it is a validation row.
    train_origin.loc[val_idx, "set"] = i

In [13]:
# Parallel lists filled by the experiment cells below: condition label,
# number of candidate pairs, and the max reachable score for each run.
conditions = []
lens = []
scores = []

In [14]:
# Experiment: distance-based candidates only, 10 neighbours per record.
condition = "dist10"
# extract_candidate reads this module-level value when it runs.
N_NEIGHBORS = 10
dist_df = extract_candidate(train_origin[train_origin["set"]==0])
train = dist_df.copy()
train = train.drop_duplicates(subset=["id", "match_id"])
# Attach the matched record's columns (prefixed with match_).
train = add_orgin_data(train, train_origin)
# NOTE(review): candidates come from set 0 only, but calc_max_score averages
# over every id in train_origin — confirm this is intended.
score = calc_max_score(train, train_origin)
# Record this run for comparison across conditions.
conditions.append(condition)
lens.append(len(train))
scores.append(score)
print(condition, len(train), score)


100%|██████████| 210/210 [00:38<00:00,  5.39it/s]


1st_stage_max_score : 0.77353
dist10 5124286 0.7735309109844906


In [15]:
# Experiment: distance-based candidates only, 20 neighbours per record.
condition = "dist20"
# extract_candidate reads this module-level value when it runs.
N_NEIGHBORS = 20
dist_df = extract_candidate(train_origin[train_origin["set"]==0])
train = dist_df.copy()
train = train.drop_duplicates(subset=["id", "match_id"])
# Attach the matched record's columns (prefixed with match_).
train = add_orgin_data(train, train_origin)
# NOTE(review): candidates come from set 0 only, but calc_max_score averages
# over every id in train_origin — confirm this is intended.
score = calc_max_score(train, train_origin)
# Record this run for comparison across conditions.
conditions.append(condition)
lens.append(len(train))
scores.append(score)
print(condition, len(train), score)

100%|██████████| 210/210 [00:46<00:00,  4.50it/s]


1st_stage_max_score : 0.78469
dist20 10813857 0.7846874758669563


In [16]:
# Experiment: distance-based candidates only, 30 neighbours per record.
condition = "dist30"
# extract_candidate reads this module-level value when it runs.
N_NEIGHBORS = 30
dist_df = extract_candidate(train_origin[train_origin["set"]==0])
train = dist_df.copy()
train = train.drop_duplicates(subset=["id", "match_id"])
# Attach the matched record's columns (prefixed with match_).
train = add_orgin_data(train, train_origin)
# NOTE(review): candidates come from set 0 only, but calc_max_score averages
# over every id in train_origin — confirm this is intended.
score = calc_max_score(train, train_origin)
# Record this run for comparison across conditions.
conditions.append(condition)
lens.append(len(train))
scores.append(score)
print(condition, len(train), score)

100%|██████████| 210/210 [00:58<00:00,  3.58it/s]


1st_stage_max_score : 0.79018
dist30 16501507 0.7901793965925074


In [17]:
# Experiment: union of distance-based and TF-IDF name-similarity candidates,
# 10 neighbours per record from each source.
condition = "dist10 + namesim10"
# Both extractors read this module-level value when they run.
N_NEIGHBORS = 10
dist_df = extract_candidate(train_origin[train_origin["set"]==0])
namesim_df = extract_candidate_tfidf_name(train_origin[train_origin["set"]==0])
train = pd.concat([dist_df, namesim_df])
# A pair found by both sources is kept once (first occurrence, i.e. the distance row).
train = train.drop_duplicates(subset=["id", "match_id"])
# Attach the matched record's columns (prefixed with match_).
train = add_orgin_data(train, train_origin)
# NOTE(review): candidates come from set 0 only, but calc_max_score averages
# over every id in train_origin — confirm this is intended.
score = calc_max_score(train, train_origin)
# Record this run for comparison across conditions.
conditions.append(condition)
lens.append(len(train))
scores.append(score)
print(condition, len(train), score)

100%|██████████| 210/210 [00:40<00:00,  5.17it/s]
100%|██████████| 210/210 [00:44<00:00,  4.69it/s]


1st_stage_max_score : 0.79885
dist10 + namesim10 10010687 0.7988485010234153


In [18]:
# Experiment: union of distance-based and TF-IDF name-similarity candidates,
# 15 neighbours per record from each source.
condition = "dist15 + namesim15"
# Both extractors read this module-level value when they run.
N_NEIGHBORS = 15
dist_df = extract_candidate(train_origin[train_origin["set"]==0])
namesim_df = extract_candidate_tfidf_name(train_origin[train_origin["set"]==0])
train = pd.concat([dist_df, namesim_df])
# A pair found by both sources is kept once (first occurrence, i.e. the distance row).
train = train.drop_duplicates(subset=["id", "match_id"])
# Attach the matched record's columns (prefixed with match_).
train = add_orgin_data(train, train_origin)
# NOTE(review): candidates come from set 0 only, but calc_max_score averages
# over every id in train_origin — confirm this is intended.
score = calc_max_score(train, train_origin)
# Record this run for comparison across conditions.
conditions.append(condition)
lens.append(len(train))
scores.append(score)
print(condition, len(train), score)

100%|██████████| 210/210 [00:42<00:00,  4.97it/s]
100%|██████████| 210/210 [00:45<00:00,  4.64it/s]


1st_stage_max_score : 0.80366
dist15 + namesim15 15627837 0.8036587650040942


In [19]:
# Experiment: union of distance-based and TF-IDF name-similarity candidates,
# 20 neighbours per record from each source.
condition = "dist20 + namesim20"
# Both extractors read this module-level value when they run.
N_NEIGHBORS = 20
dist_df = extract_candidate(train_origin[train_origin["set"]==0])
namesim_df = extract_candidate_tfidf_name(train_origin[train_origin["set"]==0])
train = pd.concat([dist_df, namesim_df])
# A pair found by both sources is kept once (first occurrence, i.e. the distance row).
train = train.drop_duplicates(subset=["id", "match_id"])
# Attach the matched record's columns (prefixed with match_).
train = add_orgin_data(train, train_origin)
# NOTE(review): candidates come from set 0 only, but calc_max_score averages
# over every id in train_origin — confirm this is intended.
score = calc_max_score(train, train_origin)
# Record this run for comparison across conditions.
conditions.append(condition)
lens.append(len(train))
scores.append(score)
print(condition, len(train), score)

100%|██████████| 210/210 [00:49<00:00,  4.20it/s]
100%|██████████| 210/210 [00:51<00:00,  4.11it/s]


1st_stage_max_score : 0.80649
dist20 + namesim20 21253775 0.8064851825864247
