# exp051_trial

In [1]:
# Execution mode switch. 'local_train' pulls paths from environment variables and
# imports the local notification helpers; 'kaggle_inference' uses fixed Kaggle paths.
# Swap which of the two lines is commented out to change mode.
MODE = 'local_train'
#MODE = 'kaggle_inference'

In [2]:
# Experiment identifier; used below to build the output directory and the Kaggle model path.
exp_name = 'exp051'
# Free-form note about this experiment (Japanese, roughly "1st-stage improvement, s-bert").
memo = '1st改善 s-bert'

In [3]:
import os
import sys
import gc

if MODE == 'local_train':
    # Make user-site packages importable before importing anything that lives there.
    sys.path.append('/home/kaggler/.local/lib/python3.8/site-packages')
    from dotenv import load_dotenv
    # Must be *called*: the bare name `load_dotenv` is a no-op expression, so the
    # .env file was never read and the os.getenv() lookups below depended entirely
    # on variables already exported in the process environment.
    load_dotenv()
    utils_path = os.getenv('UTILS_PATH')
    # Only extend sys.path when the variable is actually set; appending None is useless.
    if utils_path:
        sys.path.append(utils_path)
    import line_notify
    import slack_notify

if MODE == "kaggle_inference":
    from cuml import ForestInference
    import treelite
    
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
import matplotlib.pyplot as plt; plt.style.use("ggplot")
import seaborn as sns
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb
import itertools
from scipy.spatial.distance import canberra
from cuml.neighbors import KNeighborsRegressor
import functools
import multiprocessing
import Levenshtein
import difflib
import pickle
from tqdm import tqdm
%load_ext Cython

from transformers import DistilBertModel, DistilBertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.nn as nn
import torch

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from unidecode import unidecode
import pykakasi

from sentence_transformers import SentenceTransformer

In [4]:
# directory_setting
if MODE == 'local_train':
    # Paths are taken from environment variables.
    INPUT_DIR = os.getenv('INPUT_DIR')
    OUTPUT_DIR = os.getenv('OUTPUT_DIR')
    # NOTE(review): MODEL_DIR also reads OUTPUT_DIR — confirm this is intentional.
    MODEL_DIR = os.getenv('OUTPUT_DIR')
    BERT_MODEL = "distilbert-base-multilingual-cased"
    # Plain string concatenation: assumes OUTPUT_DIR ends with a path separator — TODO confirm.
    os.makedirs(OUTPUT_DIR + exp_name, exist_ok=True)

elif MODE == 'kaggle_inference':
    INPUT_DIR = '/kaggle/input/foursquare-location-matching/'
    OUTPUT_DIR = './'
    MODEL_DIR = f'../input/fs{exp_name}/'
    # NOTE(review): the directory name suggests a different checkpoint than the one
    # used in 'local_train' — confirm both modes load the same model.
    BERT_MODEL = "../input/distilbertbaseuncased"

else:
    # Fail here rather than with a NameError on INPUT_DIR/BERT_MODEL much later.
    raise ValueError(
        f"Unknown MODE: {MODE!r} (expected 'local_train' or 'kaggle_inference')"
    )

In [5]:
# CONFIG
SEED = 42  # random seed; not referenced in the visible cells
N_NEIGHBORS = 10  # upper bound on neighbours requested per record in the extract_candidate_* functions
N_SPLITS = 5  # presumably the number of CV folds — not used in the visible cells, confirm
PROB_TH = 0.5  # presumably a probability threshold — not used in the visible cells, confirm
MAX_LEN = 32  # token sequence length (padding/truncation) used when tokenizing for BERT inference
BS = 512  # DataLoader batch size in inference()
NW = 2  # DataLoader num_workers in inference()
SVD_N_COMP = 50  # only referenced by the commented-out TruncatedSVD step in make_bert_vec

In [6]:
class Cat2VecModel(nn.Module):
    """Pretrained DistilBERT encoder that pools its output into one unit-length vector.

    The encoder is loaded from ``BERT_MODEL``. ``forward`` takes the first element
    of the encoder output, drops sequence position 0, zeroes positions whose mask
    is 0, averages over the remaining sequence axis and normalises the result
    with ``F.normalize``.
    """

    def __init__(self):
        super().__init__()
        self.distill_bert = DistilBertModel.from_pretrained(BERT_MODEL)

    def forward(self, ids, mask):
        hidden_states = self.distill_bert(ids, mask)[0]
        # Skip position 0 (presumably the [CLS] token — confirm) in both tensors.
        token_states = hidden_states[:, 1:, :]
        token_mask = mask[:, 1:, None]
        # The mean divides by the full (padded) length; the normalisation that
        # follows removes that scale factor.
        pooled = (token_states * token_mask).mean(axis=1)
        return F.normalize(pooled)

class InferenceDataset(Dataset):
    """Dataset that tokenizes one text column of a DataFrame for BERT inference.

    Args:
        df: source frame; its index is reset so items are addressed by position.
        max_len: token sequence length every item is padded/truncated to.
        col: name of the text column to tokenize.

    Each item is a pair ``(ids, mask)`` of ``torch.LongTensor`` built from the
    tokenizer's ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, df, max_len, col):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        self.tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
        self.col = col

    def __getitem__(self, index):
        row = self.df.iloc[index]

        inputs = self.tokenizer.encode_plus(
            row[self.col],
            None,
            add_special_tokens=True,
            # Honour the length given to the constructor; previously the global
            # MAX_LEN was used here and the `max_len` argument was silently ignored.
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = torch.LongTensor(inputs['input_ids'])
        mask = torch.LongTensor(inputs['attention_mask'])

        return ids, mask

    def __len__(self):
        return self.df.shape[0]

def inference(ds):
    """Embed every item of ``ds`` with a fresh ``Cat2VecModel`` on the GPU.

    Batches are drawn in order (no shuffling, nothing dropped) with batch size
    ``BS`` and ``NW`` loader workers. Returns the per-batch outputs concatenated
    into a single numpy array.
    """
    encoder = Cat2VecModel().cuda()

    batches = DataLoader(ds, batch_size=BS, shuffle=False, num_workers=NW,
                         pin_memory=False, drop_last=False)

    # Gradients are not needed for inference.
    with torch.no_grad():
        chunks = [
            encoder(ids.cuda(), masks.cuda()).detach().cpu().numpy()
            for ids, masks in batches
        ]
    return np.concatenate(chunks)

In [7]:
def make_bert_vec(df, col):
    """Build a ``{text: float16 embedding}`` lookup for the distinct values of ``df[col]``.

    Missing values are replaced by the string "null" before embedding. The
    embeddings come from ``inference`` on an ``InferenceDataset`` and are cast
    to float16.
    """
    unique_texts = df[[col]].drop_duplicates()
    unique_texts[col] = unique_texts[col].fillna("null")

    dataset = InferenceDataset(unique_texts, max_len=MAX_LEN, col=col)
    # A TruncatedSVD reduction (SVD_N_COMP components) used to follow here; it is disabled.
    embeddings = inference(dataset).astype("float16")
    return dict(zip(unique_texts[col].values, embeddings))

In [8]:
def add_vec_sim_features(df, vec, vec_name, col):
    """Add the cosine similarity between ``col`` and ``match_<col>`` embeddings.

    ``vec`` maps a text value to its embedding. Rows where either side is the
    string "nan" get -1. The result is written to ``df`` in place as column
    ``<vec_name>_sim_<col>`` and ``df`` is returned.
    """
    def _cosine(left, right):
        # "nan" is the stringified missing value; no embedding is looked up for it.
        if left == "nan" or right == "nan":
            return -1
        a, b = vec[left], vec[right]
        return dot(a, b) / (norm(a) * norm(b))

    text_pairs = df[[f"{col}", f"match_{col}"]].values.astype(str)
    df[f"{vec_name}_sim_{col}"] = [_cosine(left, right) for left, right in tqdm(text_pairs)]
    return df

In [9]:
def preprocess(df):
    """Normalise the text columns and convert coordinates to radians, in place.

    Every text column except ``id`` is cast to ``str`` and lower-cased (so a
    missing value becomes the literal string "nan"). ``latitude`` and
    ``longitude`` are converted from degrees to radians. Returns the same,
    mutated ``df``.
    """
    text_columns = ('name', 'address', 'city', 'state',
                    'zip', 'country', 'url', 'phone', 'categories')
    for column in text_columns:
        df[column] = df[column].astype(str).str.lower()

    coord_columns = ["latitude", "longitude"]
    df[coord_columns] = np.deg2rad(df[coord_columns])

    return df

In [10]:
from cuml.feature_extraction.text import TfidfVectorizer as TfidfVectorizer_gpu
import cudf, cuml, cupy
from cuml.neighbors import NearestNeighbors as NearestNeighbors_gpu

In [11]:
def extract_candidate_dist(df):
    """Generate candidate pairs from geographic nearest neighbours within each country.

    For every country group a brute-force haversine KNN is fitted on
    ``latitude``/``longitude`` and queried with the same points. Each record is
    paired with up to ``N_NEIGHBORS`` neighbour ids (column ``match_id``, one row
    per pair) and self-pairs are dropped. Returns all groups concatenated with a
    fresh index.
    """
    coord_columns = ['latitude', 'longitude']
    per_country = []
    for _, group in tqdm(df.groupby("country")):
        group = group.reset_index(drop=True)

        # A group may be smaller than N_NEIGHBORS.
        k = min(len(group), N_NEIGHBORS)
        knn = KNeighborsRegressor(n_neighbors=k,
                                  metric='haversine', algorithm="brute")
        knn.fit(group[coord_columns], group.index.values)
        neighbor_idx = knn.kneighbors(group[coord_columns], return_distance=False)

        # Translate neighbour positions into ids, then one row per (id, match_id) pair.
        group['match_id'] = group['id'].values[neighbor_idx[:, :k]].tolist()
        pairs = group.explode(['match_id'])
        not_self = pairs['id'] != pairs['match_id']
        per_country.append(pairs.loc[not_self].copy())
    return pd.concat(per_country).reset_index(drop=True)

In [12]:
def extract_candidate_tfidf_sim(df, col):
    """Generate candidate pairs from TF-IDF nearest neighbours of ``col`` within each country.

    Rows whose ``col`` is the string "nan" are excluded, and countries with fewer
    than two remaining rows are skipped. For each remaining country, character
    trigram (``char_wb``) TF-IDF vectors are fitted and every record is paired
    with up to ``N_NEIGHBORS`` neighbours (column ``match_id``, one row per pair,
    self-pairs removed).

    Returns:
        The concatenated pairs with a fresh index. If no country has enough
        rows, an empty frame with the columns of ``df`` plus ``match_id`` is
        returned instead of raising from ``pd.concat`` on an empty list.
    """
    dfs = []
    for country, country_df in tqdm(df.groupby("country")):
        country_df = country_df[country_df[col] != "nan"].copy()
        if len(country_df) < 2:
            continue

        country_df = country_df.reset_index(drop=True)
        # A group may be smaller than N_NEIGHBORS.
        k = min(len(country_df), N_NEIGHBORS)

        vectorizer = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=True)
        text_embeddings = vectorizer.fit_transform(country_df[col].tolist())

        nn_model = NearestNeighbors_gpu(n_neighbors=k, algorithm="brute")
        nn_model.fit(text_embeddings)
        nears = nn_model.kneighbors(text_embeddings, return_distance=False)

        country_df['match_id'] = country_df['id'].values[nears[:, :k]].tolist()
        country_df = country_df.explode(['match_id'])
        country_df = country_df.loc[country_df['id'] != country_df['match_id']].copy()
        dfs.append(country_df)

    if not dfs:
        # Every group was skipped (e.g. a tiny test set): pd.concat([]) would raise.
        empty = df.iloc[:0].copy()
        empty['match_id'] = pd.Series(dtype=object)
        return empty.reset_index(drop=True)

    return pd.concat(dfs).reset_index(drop=True)

In [13]:
def make_sbert_vec(df, col, model_name):
    """Build a ``{text: float16 embedding}`` lookup for the distinct values of ``df[col]``.

    The embeddings are produced by ``SentenceTransformer(model_name).encode``
    and cast to float16.
    """
    unique_texts = df[[col]].drop_duplicates()[col]
    encoder = SentenceTransformer(model_name)
    embeddings = encoder.encode(unique_texts.tolist()).astype("float16")
    return dict(zip(unique_texts.values, embeddings))

In [14]:
def extract_candidate_sbert_sim(df, col, model_name):
    """Generate candidate pairs from sentence-embedding nearest neighbours within each country.

    Embeddings for ``col`` come from ``make_sbert_vec(df, col, model_name)``.
    Rows whose ``col`` is the string "nan" are excluded, and countries with fewer
    than two remaining rows are skipped. Every remaining record is paired with up
    to ``N_NEIGHBORS`` neighbours (column ``match_id``, one row per pair,
    self-pairs removed).

    Returns:
        The concatenated pairs with a fresh index. If no country has enough
        rows, an empty frame with the columns of ``df`` plus ``match_id`` is
        returned instead of raising from ``pd.concat`` on an empty list.
    """
    dfs = []
    vecs = make_sbert_vec(df, col, model_name)
    for country, country_df in tqdm(df.groupby("country")):
        country_df = country_df[country_df[col] != "nan"].copy()
        if len(country_df) < 2:
            continue

        country_df = country_df.reset_index(drop=True)
        # A group may be smaller than N_NEIGHBORS.
        k = min(len(country_df), N_NEIGHBORS)
        text_embeddings = np.vstack([vecs[str_] for str_ in country_df[col].values])

        nn_model = NearestNeighbors_gpu(n_neighbors=k, algorithm="brute")
        nn_model.fit(text_embeddings)
        nears = nn_model.kneighbors(text_embeddings, return_distance=False)

        country_df['match_id'] = country_df['id'].values[nears[:, :k]].tolist()
        country_df = country_df.explode(['match_id'])
        country_df = country_df.loc[country_df['id'] != country_df['match_id']].copy()
        dfs.append(country_df)

    if not dfs:
        # Every group was skipped (e.g. a tiny test set): pd.concat([]) would raise.
        empty = df.iloc[:0].copy()
        empty['match_id'] = pd.Series(dtype=object)
        return empty.reset_index(drop=True)

    return pd.concat(dfs).reset_index(drop=True)

In [15]:
def add_orgin_data(df, org_df):
    """Attach the attributes of each matched record to the candidate pairs.

    All columns of ``org_df`` are prefixed with ``match_`` and joined onto ``df``
    by ``match_id`` (default inner join). The result gets a fresh index.
    """
    match_side = org_df.add_prefix('match_')
    merged = pd.merge(df, match_side, on='match_id')
    return merged.reset_index(drop=True)

In [16]:
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Return a mapping from each ``id`` to its ``point_of_interest``."""
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {record_id: poi for record_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Return a mapping from each ``point_of_interest`` to the set of ids that belong to it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_by_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df: pd.DataFrame, org_data):
    """Mean intersection-over-union between predicted and true match sets.

    ``input_df`` has an ``id`` column and a ``matches`` column holding
    whitespace-separated ids. The true set for an id is every id in ``org_data``
    that shares its ``point_of_interest``.
    """
    id2poi = get_id2poi(org_data)
    poi2ids = get_poi2ids(org_data)

    def _iou(id_str, matches):
        truth = poi2ids[id2poi[id_str]]
        predicted = set(matches.split())
        return len(truth & predicted) / len(truth | predicted)

    id_values = input_df['id'].to_numpy()
    match_values = input_df['matches'].to_numpy()
    ious = np.array([_iou(i, m) for i, m in zip(id_values, match_values)])
    return ious.mean()

def calc_max_score(tr_data, org_data):
    """Score the candidate set as if every true pair in it were predicted correctly.

    Each id in ``org_data`` is matched with itself plus every ``match_id`` in
    ``tr_data`` whose ``match_point_of_interest`` equals the row's own
    ``point_of_interest``. The resulting ``get_score`` value is printed and returned.
    """
    unique_ids = org_data['id'].unique()
    # Every id starts out matching only itself.
    train_candidate = pd.DataFrame({'id': unique_ids, 'matches': unique_ids})

    is_true_pair = tr_data['point_of_interest'] == tr_data['match_point_of_interest']
    train_match = (
        tr_data.loc[is_true_pair]
        .groupby('id')['match_id']
        .apply(list)
        .map(" ".join)
        .reset_index()
    )
    train_match.columns = ['id', 'candidates']

    train_candidate = train_candidate.merge(train_match, on='id', how='left')
    has_candidates = train_candidate['candidates'].notna()
    train_candidate.loc[has_candidates, "matches"] += (
        " " + train_candidate.loc[has_candidates, "candidates"]
    )

    score = get_score(train_candidate, org_data)
    print('1st_stage_max_score : ' + format(score, '.5f'))
    return score

実験

In [17]:
# Load and preprocess the training data, then keep only the records whose
# (lower-cased) country code is "jp" for this experiment.
train_origin = preprocess(pd.read_csv(INPUT_DIR + "train.csv"))

is_japan = train_origin["country"] == "jp"
train_origin = train_origin[is_japan].copy().reset_index(drop=True)

In [18]:
# Accumulators filled by the experiment cells below: condition label and its score.
conds, scores = [], []

In [19]:
# Condition 1: TF-IDF neighbours on "name". Attach each candidate's POI label
# and record the best score this candidate set can reach.
cond = "tfidf"
train = (
    extract_candidate_tfidf_sim(train_origin, "name")
    .merge(
        train_origin[["id", "point_of_interest"]].add_prefix("match_"),
        on="match_id",
        how="left",
    )
)
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 1/1 [00:07<00:00,  7.25s/it]


1st_stage_max_score : 0.89251


In [20]:
# Condition 2: sentence-embedding neighbours on "name" using the "LaBSE" model.
cond = "sbert(LaBSE)"
train = (
    extract_candidate_sbert_sim(train_origin, "name", "LaBSE")
    .merge(
        train_origin[["id", "point_of_interest"]].add_prefix("match_"),
        on="match_id",
        how="left",
    )
)
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 1/1 [00:02<00:00,  2.25s/it]


1st_stage_max_score : 0.84026


In [22]:
# Condition 3: sentence-embedding neighbours on "name" using "stsb-xlm-r-multilingual".
cond = "sbert(stsb-xlm-r-multilingual)"
train = (
    extract_candidate_sbert_sim(train_origin, "name", "stsb-xlm-r-multilingual")
    .merge(
        train_origin[["id", "point_of_interest"]].add_prefix("match_"),
        on="match_id",
        how="left",
    )
)
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 1/1 [00:02<00:00,  2.03s/it]


1st_stage_max_score : 0.78491


In [24]:
# Condition 4: sentence-embedding neighbours on "name" using
# "paraphrase-multilingual-mpnet-base-v2".
cond = "sbert(paraphrase-multilingual-mpnet-base-v2)"
train = (
    extract_candidate_sbert_sim(train_origin, "name", "paraphrase-multilingual-mpnet-base-v2")
    .merge(
        train_origin[["id", "point_of_interest"]].add_prefix("match_"),
        on="match_id",
        how="left",
    )
)
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 1/1 [00:02<00:00,  2.05s/it]


1st_stage_max_score : 0.78965


In [25]:
# Summary table: one row per condition with its 1st-stage max score.
result = pd.DataFrame({"条件": conds, "1st_max_score": scores})
result

Unnamed: 0,条件,1st_max_score
0,tfidf,0.892511
1,sbert(LaBSE),0.840256
2,sbert(stsb-xlm-r-multilingual),0.784906
3,sbert(paraphrase-multilingual-mpnet-base-v2),0.78965
