# exp049_trial

In [1]:
MODE = 'local_train'
#MODE = 'kaggle_inference'

In [2]:
exp_name = 'exp049'
memo = '1st改善'

In [3]:
import os
import sys
import gc

if MODE == 'local_train':
    sys.path.append('/home/kaggler/.local/lib/python3.8/site-packages')
    from dotenv import load_dotenv
    load_dotenv
    sys.path.append(os.getenv('UTILS_PATH'))
    import line_notify
    import slack_notify
    
if MODE == "kaggle_inference":
    from cuml import ForestInference
    import treelite
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt; plt.style.use("ggplot")
import seaborn as sns
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb
import itertools
from scipy.spatial.distance import canberra
from cuml.neighbors import KNeighborsRegressor
import functools
import multiprocessing
import Levenshtein
import difflib
import pickle
from tqdm import tqdm
%load_ext Cython

from transformers import DistilBertModel, DistilBertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.nn as nn
import torch

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from unidecode import unidecode
import pykakasi

In [4]:
# directry_setting
if MODE == 'local_train':
    INPUT_DIR = os.getenv('INPUT_DIR')
    OUTPUT_DIR = os.getenv('OUTPUT_DIR')
    MODEL_DIR = os.getenv('OUTPUT_DIR')
    BERT_MODEL = "distilbert-base-multilingual-cased"
    os.makedirs(OUTPUT_DIR + exp_name, exist_ok=True)

elif MODE == 'kaggle_inference':
    INPUT_DIR = '/kaggle/input/foursquare-location-matching/'
    OUTPUT_DIR = './'
    MODEL_DIR = f'../input/fs{exp_name}/'
    BERT_MODEL = "../input/distilbertbaseuncased"

In [5]:
# CONFIG
SEED = 42
N_NEIGHBORS = 10
N_SPLITS = 5
PROB_TH = 0.5
MAX_LEN = 32
BS = 512
NW = 2
SVD_N_COMP = 50

In [6]:
class Cat2VecModel(nn.Module):
    def __init__(self):
        super(Cat2VecModel, self).__init__()
        self.distill_bert = DistilBertModel.from_pretrained(BERT_MODEL)
        
    def forward(self, ids, mask):
        x = self.distill_bert(ids, mask)[0]
        x = F.normalize((x[:, 1:, :]*mask[:, 1:, None]).mean(axis=1))
        return x

class InferenceDataset(Dataset):
    
    def __init__(self, df, max_len, col):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        self.tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
        self.col = col

    def __getitem__(self, index):
        row = self.df.iloc[index]
        
        inputs = self.tokenizer.encode_plus(
            row[self.col],
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = torch.LongTensor(inputs['input_ids'])
        mask = torch.LongTensor(inputs['attention_mask'])

        return ids, mask

    def __len__(self):
        return self.df.shape[0]

def inference(ds):
    cat2vec_model = Cat2VecModel()
    cat2vec_model = cat2vec_model.cuda()
    
    loader = DataLoader(ds, batch_size=BS, shuffle=False, num_workers=NW,
                        pin_memory=False, drop_last=False)
    
    vs = []
    with torch.no_grad():
        for idx, (ids, masks) in enumerate(loader):
            v = cat2vec_model(ids.cuda(), masks.cuda()).detach().cpu().numpy()
            vs.append(v)
    return np.concatenate(vs)

In [7]:
def make_bert_vec(df, col):
    cat_df = df[[col]].drop_duplicates()
    cat_df[col] = cat_df[col].fillna("null")

    cat_ds = InferenceDataset(cat_df, max_len=MAX_LEN, col=col)
    V = inference(cat_ds)
    #svd = TruncatedSVD(n_components=SVD_N_COMP, random_state=SEED)
    #V = svd.fit_transform(V)
    V = V.astype("float16")
    bert_vec = {k:v for k,v in zip(cat_df[col].values, V)}
    return bert_vec

In [8]:
def preprocess(df):
    columns = ['id', 'name', 'address', 'city', 'state',
        'zip', 'country', 'url', 'phone', 'categories']
    for c in columns:
        if c != "id":
            df[c] = df[c].astype(str).str.lower()

    df[["latitude", "longitude"]] = np.deg2rad(df[["latitude", "longitude"]])
    
    return df

In [9]:
from cuml.feature_extraction.text import TfidfVectorizer as TfidfVectorizer_gpu
import cudf, cuml, cupy
from cuml.neighbors import NearestNeighbors as NearestNeighbors_gpu

In [10]:
def extract_candidate_dist(df):
    dfs = []
    for country, country_df in tqdm(df.groupby("country")):
        country_df = country_df.reset_index(drop=True)
        
        knn = KNeighborsRegressor(n_neighbors=min(len(country_df), N_NEIGHBORS), 
                                    metric='haversine', algorithm="brute")
        knn.fit(country_df[['latitude','longitude']], country_df.index.values)
        nears = knn.kneighbors(country_df[['latitude','longitude']], return_distance=False)
        
        k = min(len(country_df), N_NEIGHBORS)
        country_df['match_id'] = country_df['id'].values[nears[:, :k]].tolist()
        country_df = country_df.explode(['match_id'])
        country_df = country_df.loc[country_df['id'] != country_df['match_id']].copy()
        dfs.append(country_df)
    df = pd.concat(dfs).reset_index(drop=True)
    return df

In [11]:
def extract_candidate_tfidf_sim(df, col):
    dfs = []
    for country, country_df in tqdm(df.groupby("country")):
        country_df = country_df[country_df[col]!="nan"].copy()
        if len(country_df) < 2:
            continue

        country_df = country_df.reset_index(drop=True)
        
        model = TfidfVectorizer(ngram_range=(3,3), analyzer="char_wb", use_idf=True)
        text_embeddings = model.fit_transform(country_df[col].tolist())

        model = NearestNeighbors_gpu(n_neighbors=min(len(country_df), N_NEIGHBORS), algorithm="brute")
        model.fit(text_embeddings)
        nears = model.kneighbors(text_embeddings, return_distance=False)
        
        k = min(len(country_df), N_NEIGHBORS)
        country_df['match_id'] = country_df['id'].values[nears[:, :k]].tolist()
        country_df = country_df.explode(['match_id'])
        country_df = country_df.loc[country_df['id'] != country_df['match_id']].copy()
        dfs.append(country_df)
    df = pd.concat(dfs).reset_index(drop=True)
    return df

In [12]:
def extract_candidate_bert_sim(df, col):
    dfs = []
    vecs = make_bert_vec(df,col)
    for country, country_df in tqdm(df.groupby("country")):
        country_df = country_df[country_df[col]!="nan"].copy()
        if len(country_df) < 2:
            continue

        country_df = country_df.reset_index(drop=True)
        text_embeddings = np.vstack([vecs[str_] for str_ in country_df[col].values])
        
        model = NearestNeighbors_gpu(n_neighbors=min(len(country_df), N_NEIGHBORS), algorithm="brute")
        model.fit(text_embeddings)
        nears = model.kneighbors(text_embeddings, return_distance=False)
        
        k = min(len(country_df), N_NEIGHBORS)
        country_df['match_id'] = country_df['id'].values[nears[:, :k]].tolist()
        country_df = country_df.explode(['match_id'])
        country_df = country_df.loc[country_df['id'] != country_df['match_id']].copy()
        dfs.append(country_df)
    df = pd.concat(dfs).reset_index(drop=True)
    return df

In [13]:
def add_orgin_data(df, org_df):
    df = df.merge(org_df.add_prefix('match_'), on='match_id')
    df = df.reset_index(drop=True)
    return df

In [14]:
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame, org_data):
    scores = []
    id2poi = get_id2poi(org_data)
    poi2ids = get_poi2ids(org_data)
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    return scores.mean()

def calc_max_score(tr_data, org_data):
    train_candidate = pd.DataFrame()
    train_candidate['id'] = org_data['id'].unique()
    train_candidate['matches'] = org_data['id'].unique()
    idx = tr_data['point_of_interest']==tr_data['match_point_of_interest']
    train_match = tr_data.loc[idx].groupby('id')['match_id'].apply(list).map(" ".join).reset_index()
    train_match.columns = ['id','candidates']
    train_candidate = train_candidate.merge(train_match, on = 'id', how = 'left')
    idx = ~train_candidate['candidates'].isna()
    train_candidate.loc[idx, "matches"] += " " + train_candidate.loc[idx, "candidates"]
    score = get_score(train_candidate, org_data)
    print('1st_stage_max_score : ' + '{:.5f}'.format(score))
    return score

In [15]:
train_origin = pd.read_csv(INPUT_DIR + "train.csv")
train_origin = preprocess(train_origin)

# trainデータの分割
kf = GroupKFold(n_splits=2)
for i, (trn_idx, val_idx) in enumerate(kf.split(train_origin, train_origin['point_of_interest'], train_origin['point_of_interest'])):
    train_origin = train_origin.loc[val_idx].copy()
    break
train_origin = train_origin.reset_index(drop=True)

In [16]:
conds = []
scores = []

In [17]:
cond = "nameのみ"
train = extract_candidate_tfidf_sim(train_origin, "name")
train = train.merge(train_origin[["id", "point_of_interest"]].add_prefix("match_"), on="match_id", how="left")
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 210/210 [00:36<00:00,  5.80it/s]


1st_stage_max_score : 0.88423


In [18]:
cond = "name + address"
train_origin["text"] = train_origin["name"].replace("nan", "") + " " + \
                       train_origin["address"].replace("nan", "")
train = extract_candidate_tfidf_sim(train_origin, "text")
train = train.merge(train_origin[["id", "point_of_interest"]].add_prefix("match_"), on="match_id", how="left")
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 210/210 [00:43<00:00,  4.86it/s]


1st_stage_max_score : 0.90294


In [19]:
cond = "name + city"
train_origin["text"] = train_origin["name"].replace("nan", "") + " " + \
                       train_origin["city"].replace("nan", "")
train = extract_candidate_tfidf_sim(train_origin, "text")
train = train.merge(train_origin[["id", "point_of_interest"]].add_prefix("match_"), on="match_id", how="left")
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 210/210 [00:38<00:00,  5.44it/s]


1st_stage_max_score : 0.90803


In [20]:
cond = "name + state"
train_origin["text"] = train_origin["name"].replace("nan", "") + " " + \
                       train_origin["state"].replace("nan", "")
train = extract_candidate_tfidf_sim(train_origin, "text")
train = train.merge(train_origin[["id", "point_of_interest"]].add_prefix("match_"), on="match_id", how="left")
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 210/210 [00:37<00:00,  5.66it/s]


1st_stage_max_score : 0.89184


In [21]:
cond = "name + url"
train_origin["text"] = train_origin["name"].replace("nan", "") + " " + \
                       train_origin["url"].replace("nan", "")
train = extract_candidate_tfidf_sim(train_origin, "text")
train = train.merge(train_origin[["id", "point_of_interest"]].add_prefix("match_"), on="match_id", how="left")
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 210/210 [00:42<00:00,  4.91it/s]


1st_stage_max_score : 0.88436


In [22]:
cond = "name + categories"
train_origin["text"] = train_origin["name"].replace("nan", "") + " " + \
                       train_origin["categories"].replace("nan", "")
train = extract_candidate_tfidf_sim(train_origin, "text")
train = train.merge(train_origin[["id", "point_of_interest"]].add_prefix("match_"), on="match_id", how="left")
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 210/210 [00:46<00:00,  4.51it/s]


1st_stage_max_score : 0.86467


In [23]:
cond = "name + address + city"
train_origin["text"] = train_origin["name"].replace("nan", "") + " " + \
                       train_origin["address"].replace("nan", "") + " " + \
                       train_origin["city"].replace("nan", "")
train = extract_candidate_tfidf_sim(train_origin, "text")
train = train.merge(train_origin[["id", "point_of_interest"]].add_prefix("match_"), on="match_id", how="left")
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 210/210 [00:49<00:00,  4.26it/s]


1st_stage_max_score : 0.91179


In [24]:
cond = "name + address + city + state"
train_origin["text"] = train_origin["name"].replace("nan", "") + " " + \
                       train_origin["address"].replace("nan", "") + " " + \
                       train_origin["city"].replace("nan", "") + " " + \
                       train_origin["state"].replace("nan", "") 
train = extract_candidate_tfidf_sim(train_origin, "text")
train = train.merge(train_origin[["id", "point_of_interest"]].add_prefix("match_"), on="match_id", how="left")
score = calc_max_score(train, train_origin)
conds.append(cond)
scores.append(score)

100%|██████████| 210/210 [00:52<00:00,  4.02it/s]


1st_stage_max_score : 0.90675


In [29]:
result = pd.DataFrame()
result["条件"] = conds
result["1st_max_score"] = scores
result

Unnamed: 0,条件,1st_max_score
0,nameのみ,0.884232
1,name + address,0.902943
2,name + city,0.908033
3,name + state,0.891844
4,name + url,0.88436
5,name + categories,0.864667
6,name + address + city,0.911794
7,name + address + city + state,0.906749


tfidf条件

In [26]:
def extract_candidate_tfidf_sim2(df, col, ngram_range, analyzer, use_idf):
    dfs = []
    for country, country_df in tqdm(df.groupby("country")):
        country_df = country_df[country_df[col]!="nan"].copy()
        if len(country_df) < 2:
            continue

        country_df = country_df.reset_index(drop=True)
        
        model = TfidfVectorizer(ngram_range=ngram_range, analyzer=analyzer, use_idf=use_idf)
        text_embeddings = model.fit_transform(country_df[col].tolist())

        model = NearestNeighbors_gpu(n_neighbors=min(len(country_df), N_NEIGHBORS), algorithm="brute")
        model.fit(text_embeddings)
        nears = model.kneighbors(text_embeddings, return_distance=False)
        
        k = min(len(country_df), N_NEIGHBORS)
        country_df['match_id'] = country_df['id'].values[nears[:, :k]].tolist()
        country_df = country_df.explode(['match_id'])
        country_df = country_df.loc[country_df['id'] != country_df['match_id']].copy()
        dfs.append(country_df)
    df = pd.concat(dfs).reset_index(drop=True)
    return df

In [35]:
train_origin["text"] = train_origin["name"].replace("nan", "") + " " + \
                       train_origin["address"].replace("nan", "") + " " + \
                       train_origin["city"].replace("nan", "")

In [36]:
ngrams = [(1,1), (1,2), (1,3), (2,2), (2,3), (3,3)]
analyzers = ["word", "char", "char_wb"]
use_idfs = [True, False]

In [38]:
import datetime

In [39]:
results = []
for ngram, analyzer, use_idf in itertools.product(ngrams, analyzers, use_idfs):
    st_time = datetime.datetime.now()
    train = extract_candidate_tfidf_sim2(train_origin, "text", ngram, analyzer, use_idf)
    ed_time = datetime.datetime.now()
    t = (ed_time - st_time).seconds
    train = train.merge(train_origin[["id", "point_of_interest"]].add_prefix("match_"), on="match_id", how="left")
    score = calc_max_score(train, train_origin)
    print(ngram, analyzer, use_idf, score, t)
    results.append([ngram, analyzer, use_idf, score, t])

100%|██████████| 210/210 [00:28<00:00,  7.49it/s]


1st_stage_max_score : 0.87565
(1, 1) word True 0.8756546269287908 32


100%|██████████| 210/210 [00:26<00:00,  7.84it/s]


1st_stage_max_score : 0.86014
(1, 1) word False 0.8601403807249306 31


100%|██████████| 210/210 [00:31<00:00,  6.67it/s]


1st_stage_max_score : 0.81241
(1, 1) char True 0.8124075356143731 35


100%|██████████| 210/210 [00:31<00:00,  6.67it/s]


1st_stage_max_score : 0.80656
(1, 1) char False 0.806555327536221 36


100%|██████████| 210/210 [00:37<00:00,  5.64it/s]


1st_stage_max_score : 0.80972
(1, 1) char_wb True 0.8097237420495128 41


100%|██████████| 210/210 [00:36<00:00,  5.81it/s]


1st_stage_max_score : 0.80159
(1, 1) char_wb False 0.8015947364919042 40


100%|██████████| 210/210 [00:34<00:00,  6.04it/s]


1st_stage_max_score : 0.84407
(1, 2) word True 0.8440746486946918 39


100%|██████████| 210/210 [00:34<00:00,  6.15it/s]


1st_stage_max_score : 0.84055
(1, 2) word False 0.8405519556668388 38


100%|██████████| 210/210 [00:53<00:00,  3.96it/s]


1st_stage_max_score : 0.88938
(1, 2) char True 0.8893813452852076 57


100%|██████████| 210/210 [00:52<00:00,  4.01it/s]


1st_stage_max_score : 0.85952
(1, 2) char False 0.8595237243217336 56


100%|██████████| 210/210 [00:59<00:00,  3.52it/s]


1st_stage_max_score : 0.88777
(1, 2) char_wb True 0.8877744349540051 64


100%|██████████| 210/210 [00:58<00:00,  3.57it/s]


1st_stage_max_score : 0.85429
(1, 2) char_wb False 0.85429435487618 63


100%|██████████| 210/210 [00:42<00:00,  4.97it/s]


1st_stage_max_score : 0.83067
(1, 3) word True 0.830667381701086 46


100%|██████████| 210/210 [00:41<00:00,  5.09it/s]


1st_stage_max_score : 0.82717
(1, 3) word False 0.8271708325805838 45


100%|██████████| 210/210 [01:30<00:00,  2.32it/s]


1st_stage_max_score : 0.90554
(1, 3) char True 0.9055365439915224 95


100%|██████████| 210/210 [01:29<00:00,  2.35it/s]


1st_stage_max_score : 0.87515
(1, 3) char False 0.875148037949855 93


100%|██████████| 210/210 [01:36<00:00,  2.18it/s]


1st_stage_max_score : 0.90653
(1, 3) char_wb True 0.9065344007427815 100


100%|██████████| 210/210 [01:34<00:00,  2.22it/s]


1st_stage_max_score : 0.86942
(1, 3) char_wb False 0.8694207742507833 99


100%|██████████| 210/210 [00:30<00:00,  6.98it/s]


1st_stage_max_score : 0.74698
(2, 2) word True 0.7469842611517388 34


100%|██████████| 210/210 [00:29<00:00,  7.14it/s]


1st_stage_max_score : 0.75106
(2, 2) word False 0.7510552019406153 33


100%|██████████| 210/210 [00:42<00:00,  5.00it/s]


1st_stage_max_score : 0.89770
(2, 2) char True 0.8977027705323352 46


100%|██████████| 210/210 [00:41<00:00,  5.08it/s]


1st_stage_max_score : 0.88224
(2, 2) char False 0.8822405478818399 45


100%|██████████| 210/210 [00:44<00:00,  4.67it/s]


1st_stage_max_score : 0.89997
(2, 2) char_wb True 0.8999721369042379 49


100%|██████████| 210/210 [00:44<00:00,  4.70it/s]


1st_stage_max_score : 0.88538
(2, 2) char_wb False 0.8853826318848681 49


100%|██████████| 210/210 [00:37<00:00,  5.63it/s]


1st_stage_max_score : 0.73044
(2, 3) word True 0.7304437101309482 41


100%|██████████| 210/210 [00:37<00:00,  5.66it/s]


1st_stage_max_score : 0.73801
(2, 3) word False 0.73800967949315 41


100%|██████████| 210/210 [01:15<00:00,  2.78it/s]


1st_stage_max_score : 0.90768
(2, 3) char True 0.9076845081205892 79


100%|██████████| 210/210 [01:14<00:00,  2.83it/s]


1st_stage_max_score : 0.88958
(2, 3) char False 0.889578661825101 78


100%|██████████| 210/210 [01:16<00:00,  2.74it/s]


1st_stage_max_score : 0.91032
(2, 3) char_wb True 0.9103175165368081 81


100%|██████████| 210/210 [01:15<00:00,  2.78it/s]


1st_stage_max_score : 0.89274
(2, 3) char_wb False 0.8927355327239233 80


 80%|███████▉  | 167/210 [00:15<00:03, 11.12it/s]


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [40]:
result = pd.DataFrame(results, columns=["ngram_range", "analyzer", "use_idf", "1st_max_score", "time"])
result

Unnamed: 0,ngram_range,analyzer,use_idf,1st_max_score,time
0,"(1, 1)",word,True,0.875655,32
1,"(1, 1)",word,False,0.86014,31
2,"(1, 1)",char,True,0.812408,35
3,"(1, 1)",char,False,0.806555,36
4,"(1, 1)",char_wb,True,0.809724,41
5,"(1, 1)",char_wb,False,0.801595,40
6,"(1, 2)",word,True,0.844075,39
7,"(1, 2)",word,False,0.840552,38
8,"(1, 2)",char,True,0.889381,57
9,"(1, 2)",char,False,0.859524,56


In [41]:
print(ngram, analyzer, use_idf, score, t)

(3, 3) word True 0.8927355327239233 80
