# exp034_trial
bert類似度計算の高速化

In [61]:
MODE = 'local_train'
#MODE = 'kaggle_inference'

In [62]:
exp_name = 'exp032'
memo = 'exp030 dist10+tfidf_name_sim10の1st'

In [63]:
import os
import sys
import gc

if MODE == 'local_train':
    # Local runs: make the user-site packages importable, then read paths
    # from the .env file.
    sys.path.append('/home/kaggler/.local/lib/python3.8/site-packages')
    from dotenv import load_dotenv
    # load_dotenv must be *called*; the bare name is a no-op expression, so
    # the .env file was never read and the os.getenv() lookups could return
    # None unless the variables were already exported by the shell.
    load_dotenv()
    sys.path.append(os.getenv('UTILS_PATH'))
    import line_notify
    import slack_notify
    
if MODE == "kaggle_inference":
    from cuml import ForestInference
    import treelite
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt; plt.style.use("ggplot")
import seaborn as sns
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb
import itertools
from scipy.spatial.distance import canberra
from sklearn.neighbors import KNeighborsRegressor
import functools
import multiprocessing
import Levenshtein
import difflib
import pickle
from tqdm import tqdm
%load_ext Cython

from transformers import DistilBertModel, DistilBertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.nn as nn
import torch

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from cuml.feature_extraction.text import TfidfVectorizer as TfidfVectorizer_gpu
import cudf, cuml, cupy
from cuml.neighbors import NearestNeighbors as NearestNeighbors_gpu

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [64]:
# directory_setting
# Chooses input/output/model locations and the DistilBERT checkpoint per MODE.
if MODE == 'local_train':
    # Paths come from environment variables; os.getenv returns None when a
    # variable is unset, in which case the concatenation below raises TypeError.
    INPUT_DIR = os.getenv('INPUT_DIR')
    OUTPUT_DIR = os.getenv('OUTPUT_DIR')
    # NOTE(review): MODEL_DIR reads the same OUTPUT_DIR variable as above --
    # confirm this is intentional.
    MODEL_DIR = os.getenv('OUTPUT_DIR')
    BERT_MODEL = "distilbert-base-multilingual-cased"
    # Plain string concatenation: OUTPUT_DIR must already end with a path
    # separator for this to create a sub-directory named after the experiment.
    os.makedirs(OUTPUT_DIR + exp_name, exist_ok=True)

elif MODE == 'kaggle_inference':
    INPUT_DIR = '/kaggle/input/foursquare-location-matching/'
    OUTPUT_DIR = './'
    MODEL_DIR = f'../input/fs{exp_name}/'
    # NOTE(review): this path is named "distilbertbaseuncased" while local
    # training uses the multilingual cased checkpoint -- confirm both modes are
    # meant to use different models.
    BERT_MODEL = "../input/distilbertbaseuncased"

In [65]:
# CONFIG
# Global constants shared by the cells in this notebook.
SEED = 42  # only referenced by a commented-out SVD line in the visible cells
N_NEIGHBORS = 10  # upper bound on neighbours retrieved per row in the candidate extractors
N_SPLITS = 5  # not referenced in the visible cells
PROB_TH = 0.5  # not referenced in the visible cells
MAX_LEN = 32  # maximum token length used when tokenizing text for DistilBERT
BS = 512  # DataLoader batch size used in inference()
NW = 2  # DataLoader num_workers used in inference()
SVD_N_COMP = 50  # only referenced by a commented-out SVD line in the visible cells

In [66]:
class Cat2VecModel(nn.Module):
    """DistilBERT wrapper that turns token ids into one unit-length vector per row.

    The checkpoint is loaded from the module-level ``BERT_MODEL`` setting.
    """

    def __init__(self):
        super().__init__()
        self.distill_bert = DistilBertModel.from_pretrained(BERT_MODEL)

    def forward(self, ids, mask):
        """Return L2-normalised pooled embeddings.

        Args:
            ids: token id tensor passed straight to the DistilBERT model.
            mask: attention mask with the same leading two dimensions as ``ids``.

        Returns:
            One normalised vector per row of ``ids``.
        """
        # First element of the model output (presumably the last hidden state).
        hidden = self.distill_bert(ids, mask)[0]
        # Drop position 0 (presumably the [CLS] token) and zero out every
        # position the mask marks as padding.
        token_states = hidden[:, 1:, :]
        keep = mask[:, 1:, None]
        # The mean runs over all remaining positions, padding included; the
        # scale does not matter because the result is normalised right after.
        pooled = (token_states * keep).mean(dim=1)
        return F.normalize(pooled)

class InferenceDataset(Dataset):
    """Torch dataset that tokenizes one text column of a DataFrame on the fly.

    Args:
        df: DataFrame holding the text column; its index is reset internally
            so items are addressed by position.
        max_len: token length every item is padded/truncated to.
        col: name of the text column to tokenize.
    """

    def __init__(self, df, max_len, col):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        self.tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
        self.col = col

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` LongTensors for row ``index``."""
        row = self.df.iloc[index]

        inputs = self.tokenizer.encode_plus(
            row[self.col],
            None,
            add_special_tokens=True,
            # Honour the length given to the constructor; previously the global
            # MAX_LEN was used here and the ``max_len`` argument was ignored.
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = torch.LongTensor(inputs['input_ids'])
        mask = torch.LongTensor(inputs['attention_mask'])

        return ids, mask

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

def inference(ds):
    """Embed every item of ``ds`` on the GPU and return the stacked vectors.

    A fresh ``Cat2VecModel`` is built and moved to CUDA on every call. Batches
    are read in dataset order (no shuffling, nothing dropped), so row ``i`` of
    the result corresponds to item ``i`` of ``ds``.

    Args:
        ds: dataset yielding ``(ids, mask)`` pairs.

    Returns:
        NumPy array with the concatenated model outputs of all batches.
    """
    encoder = Cat2VecModel().cuda()
    batches = DataLoader(ds, batch_size=BS, shuffle=False, num_workers=NW,
                         pin_memory=False, drop_last=False)

    # Gradients are never needed here, so disable autograd for the whole pass.
    with torch.no_grad():
        outputs = [
            encoder(ids.cuda(), masks.cuda()).detach().cpu().numpy()
            for ids, masks in batches
        ]
    return np.concatenate(outputs)

In [67]:
def make_bert_vec(df, col):
    """Build a ``{text: embedding}`` lookup for the distinct values of ``df[col]``.

    Args:
        df: DataFrame containing ``col``.
        col: name of the text column to embed.

    Returns:
        Dict mapping each distinct text (missing values become ``"null"``) to
        its float16 embedding vector.
    """
    # Embed each distinct string only once.
    unique_df = df[[col]].drop_duplicates()
    unique_df[col] = unique_df[col].fillna("null")

    dataset = InferenceDataset(unique_df, max_len=MAX_LEN, col=col)
    # Stored as float16 to halve the memory footprint of the lookup.
    embeddings = inference(dataset).astype("float16")
    return dict(zip(unique_df[col].values, embeddings))

In [68]:
def preprocess(df):
    """Normalise text columns and convert coordinates to radians, in place.

    Every text column is cast to ``str`` and lower-cased (``id`` is left
    untouched); ``latitude`` and ``longitude`` are converted from degrees to
    radians.

    Args:
        df: DataFrame with the text and coordinate columns; it is modified.

    Returns:
        The same DataFrame object that was passed in.
    """
    text_columns = ('name', 'address', 'city', 'state',
                    'zip', 'country', 'url', 'phone', 'categories')
    for column in text_columns:
        df[column] = df[column].astype(str).str.lower()

    coord_columns = ["latitude", "longitude"]
    df[coord_columns] = np.deg2rad(df[coord_columns])

    return df

In [69]:
def extract_candidate_dist(df):
    """Pair every row with its geographically nearest rows of the same country.

    For each country group a haversine k-NN search (k is capped at
    ``N_NEIGHBORS`` and at the group size) is run on ``latitude``/``longitude``.
    Each row is then expanded into one row per neighbour, with the neighbour's
    id stored in ``match_id``; pairs of a row with itself are removed.

    Args:
        df: DataFrame with ``id``, ``country``, ``latitude`` and ``longitude``.

    Returns:
        DataFrame of candidate pairs with a fresh integer index.
    """
    per_country = []
    for _, group in tqdm(df.groupby("country")):
        # Positional index so the k-NN output can index ``id`` values directly.
        group = group.reset_index(drop=True)
        k = min(len(group), N_NEIGHBORS)
        coords = group[['latitude', 'longitude']]

        # The regressor is only used for its neighbour search; the fitted
        # target (the row index) is never predicted.
        knn = KNeighborsRegressor(n_neighbors=k, metric='haversine', n_jobs=-1)
        knn.fit(coords, group.index)
        neighbor_idx = knn.kneighbors(coords, return_distance=False)

        group['match_id'] = group['id'].values[neighbor_idx[:, :k]].tolist()
        pairs = group.explode(['match_id'])
        per_country.append(pairs.loc[pairs['id'] != pairs['match_id']].copy())
    return pd.concat(per_country).reset_index(drop=True)

In [70]:
def extract_candidate_tfidf_sim(df, col):
    """Pair every row with the rows of the same country whose ``col`` text is closest.

    Per country, rows whose ``col`` equals the string ``"nan"`` are dropped and
    groups with fewer than two remaining rows are skipped. The text is
    vectorised with a binary GPU TF-IDF model and a brute-force GPU
    nearest-neighbour search (k capped at ``N_NEIGHBORS`` and the group size)
    supplies the candidates. Each row is expanded into one row per neighbour
    (``match_id``); self-pairs are removed.

    Args:
        df: DataFrame with ``id``, ``country`` and the text column ``col``.
        col: name of the text column to compare.

    Returns:
        DataFrame of candidate pairs with a fresh integer index.
    """
    per_country = []
    for _, group in tqdm(df.groupby("country")):
        group = group[group[col] != "nan"].copy()
        if len(group) < 2:
            continue

        # Positional index so neighbour indices map straight onto ``id`` values.
        group = group.reset_index(drop=True)
        k = min(len(group), N_NEIGHBORS)

        vectorizer = TfidfVectorizer_gpu(stop_words='english', binary=True)
        embeddings = vectorizer.fit_transform(cudf.from_pandas(group[col]))

        searcher = NearestNeighbors_gpu(n_neighbors=k, algorithm="brute")
        searcher.fit(embeddings)
        neighbor_idx = searcher.kneighbors(embeddings, return_distance=False)

        # ``.get()`` copies the neighbour indices back to host memory.
        group['match_id'] = group['id'].values[neighbor_idx[:, :k].get()].tolist()
        pairs = group.explode(['match_id'])
        per_country.append(pairs.loc[pairs['id'] != pairs['match_id']].copy())
    return pd.concat(per_country).reset_index(drop=True)

In [71]:
def add_orgin_data(df, org_df):
    """Attach the matched row's original columns to each candidate pair.

    Every column of ``org_df`` is prefixed with ``match_`` and joined onto
    ``df`` through ``match_id`` (inner join).

    Args:
        df: candidate pairs containing a ``match_id`` column.
        org_df: original records; its ``id`` column becomes ``match_id``.

    Returns:
        Merged DataFrame with a fresh integer index.
    """
    matched_side = org_df.add_prefix('match_')
    merged = df.merge(matched_side, on='match_id')
    return merged.reset_index(drop=True)

In [72]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of S and T.

    Uses the full (len(S)+1) x (len(T)+1) dynamic-programming table held in
    Python lists.
    """
    cdef int row, col
    cdef int n_s = len(S)
    cdef int n_t = len(T)
    cdef list table = [[0] * (n_t + 1) for _ in range(n_s + 1)]
    for row in range(1, n_s + 1):
        for col in range(1, n_t + 1):
            # Extend the diagonal on a character match, otherwise carry over
            # the better of the left / upper neighbours.
            diagonal = table[row - 1][col - 1] + (1 if S[row - 1] == T[col - 1] else 0)
            table[row][col] = max(diagonal, table[row][col - 1], table[row - 1][col])
    return table[n_s][n_t]

In [73]:
%%cython
# Optimized version
import numpy as np
cimport numpy as np

import cython
from libc.stdlib cimport malloc, free


@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
def fast_LCS(str S, str T):
    """Return the longest-common-subsequence length of S and T.

    Keeps only two rows of the dynamic-programming table (previous and
    current) instead of the full matrix.
    """
    # Make T the shorter string so each row has only len(T) + 1 cells.
    if len(S) < len(T):
        S, T = T, S

    cdef int i, j
    cdef np.uint16_t[:] dp_prev, dp_curr
    
    # Rows are uint16 typed memoryviews.
    # NOTE(review): a common subsequence longer than 65535 characters would not
    # fit in uint16 -- confirm inputs are always shorter than that.
    dp_prev = np.zeros(len(T) + 1, dtype=np.uint16)
    dp_curr = np.zeros(len(T) + 1, dtype=np.uint16)

    for i in range(len(S)):
        for j in range(len(T)):
            # Best of: diagonal (+1 on a character match), left cell, upper cell.
            dp_curr[j + 1]  = max(dp_prev[j] + (1 if S[i] == T[j] else 0), dp_curr[j], dp_prev[j + 1])
        # Swap roles; the row just filled becomes the "previous" row and the
        # old one is overwritten on the next pass.
        dp_prev, dp_curr = dp_curr, dp_prev
    # After the final swap dp_prev is the last row that was filled in.
    return dp_prev[len(T)]

In [74]:
def make_tfidf_vec(df, col):
    """Build a ``{text: tfidf_row}`` lookup over both sides of the candidate pairs.

    The distinct values of ``df[col]`` and ``df["match_" + col]`` are pooled and
    vectorised with a word uni-/bi-gram TF-IDF model (English stop words
    removed).

    Args:
        df: candidate pairs containing ``col`` and ``match_<col>``.
        col: base name of the text column.

    Returns:
        Dict mapping each distinct text to its row of the TF-IDF matrix.
    """
    texts = pd.concat([df[col], df["match_" + col]]).drop_duplicates()
    texts = texts.reset_index(drop=True)

    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    matrix = vectorizer.fit_transform(texts.tolist())
    # Iterating the matrix yields one row per text, in the same order.
    return dict(zip(texts.values, matrix))

In [75]:
def _add_distance_features(args):
    """Add string-similarity features for one ``(country, frame)`` group.

    For each text column ``c`` the row's own value is compared with
    ``match_<c>`` and these columns are added:

    * ``match_<c>_gesh``  - difflib SequenceMatcher ratio (float16)
    * ``match_<c>_leven`` - Levenshtein distance
    * ``match_<c>_jaro``  - Jaro-Winkler similarity (float16)
    * ``match_<c>_lcs``   - longest-common-subsequence length (via ``LCS``)

    plus, for every column except ``phone`` and ``zip``, the length-normalised
    ``match_<c>_nleven``, ``match_<c>_nlcsi`` and ``match_<c>_nlcs0`` (float16).
    When either side is the string ``"nan"`` all four raw metrics are ``-1``.

    Args:
        args: ``(group_key, DataFrame)`` tuple as yielded by ``groupby``; the
            key is ignored.

    Returns:
        The DataFrame with the feature columns added.
    """
    _, df = args

    def pair_metrics(left, right):
        # Missing values arrive as the literal string "nan".
        if left == "nan" or right == "nan":
            return -1, -1, -1, -1
        return (
            difflib.SequenceMatcher(None, left, right).ratio(),
            Levenshtein.distance(left, right),
            Levenshtein.jaro_winkler(left, right),
            LCS(str(left), str(right)),
        )

    for c in ('name', 'address', 'city', 'state',
              'zip', 'url', 'phone', 'categories'):
        own_col = f"{c}"
        match_col = f"match_{c}"
        metrics = [pair_metrics(left, right)
                   for left, right in df[[own_col, match_col]].values.astype(str)]

        gesh_values = [m[0] for m in metrics]
        jaro_values = [m[2] for m in metrics]
        df[f"match_{c}_gesh"] = np.asarray(gesh_values, dtype=np.float64).astype(np.float16)
        df[f"match_{c}_leven"] = [m[1] for m in metrics]
        df[f"match_{c}_jaro"] = np.asarray(jaro_values, dtype=np.float64).astype(np.float16)
        df[f"match_{c}_lcs"] = [m[3] for m in metrics]

        # No length-normalised variants for phone numbers and zip codes.
        if c in ("phone", "zip"):
            continue

        match_len = df[match_col].astype(str).map(len)
        own_len = df[own_col].astype(str).map(len)
        longest = np.maximum(match_len, own_len)

        df[f"match_{c}_nleven"] = (df[f"match_{c}_leven"] / longest).astype(np.float16)
        df[f"match_{c}_nlcsi"] = (df[f"match_{c}_lcs"] / match_len).astype(np.float16)
        df[f"match_{c}_nlcs0"] = (df[f"match_{c}_lcs"] / own_len).astype(np.float16)

    return df


def add_distance_features(df):
    """Compute the string-distance features country by country in parallel.

    Each country group is handed to ``_add_distance_features`` in a worker
    pool sized to the CPU count; the results are concatenated.

    NOTE: ``imap_unordered`` returns groups in completion order, so the row
    order of the result can differ between runs.

    Args:
        df: candidate pairs containing a ``country`` column.

    Returns:
        Concatenation of all processed groups.
    """
    n_workers = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=n_workers) as pool:
        results = list(tqdm(pool.imap_unordered(_add_distance_features,
                                                df.groupby('country'))))
    return pd.concat(results)

In [76]:
def _add_distance_features_new(args):
    """Add string-similarity features for one ``(country, frame)`` group.

    Same features as ``_add_distance_features`` but the longest-common-
    subsequence length is computed with ``fast_LCS``. For each text column
    ``c`` these columns are added:

    * ``match_<c>_gesh``  - difflib SequenceMatcher ratio (float16)
    * ``match_<c>_leven`` - Levenshtein distance
    * ``match_<c>_jaro``  - Jaro-Winkler similarity (float16)
    * ``match_<c>_lcs``   - longest-common-subsequence length

    plus, for every column except ``phone`` and ``zip``, the length-normalised
    ``match_<c>_nleven``, ``match_<c>_nlcsi`` and ``match_<c>_nlcs0`` (float16).
    When either side is the string ``"nan"`` all four raw metrics are ``-1``.

    Args:
        args: ``(group_key, DataFrame)`` tuple as yielded by ``groupby``; the
            key is ignored.

    Returns:
        The DataFrame with the feature columns added.
    """
    _, df = args

    def pair_metrics(left, right):
        # Missing values arrive as the literal string "nan".
        if left == "nan" or right == "nan":
            return -1, -1, -1, -1
        return (
            difflib.SequenceMatcher(None, left, right).ratio(),
            Levenshtein.distance(left, right),
            Levenshtein.jaro_winkler(left, right),
            fast_LCS(str(left), str(right)),
        )

    for c in ('name', 'address', 'city', 'state',
              'zip', 'url', 'phone', 'categories'):
        own_col = f"{c}"
        match_col = f"match_{c}"
        metrics = [pair_metrics(left, right)
                   for left, right in df[[own_col, match_col]].values.astype(str)]

        gesh_values = [m[0] for m in metrics]
        jaro_values = [m[2] for m in metrics]
        df[f"match_{c}_gesh"] = np.asarray(gesh_values, dtype=np.float64).astype(np.float16)
        df[f"match_{c}_leven"] = [m[1] for m in metrics]
        df[f"match_{c}_jaro"] = np.asarray(jaro_values, dtype=np.float64).astype(np.float16)
        df[f"match_{c}_lcs"] = [m[3] for m in metrics]

        # No length-normalised variants for phone numbers and zip codes.
        if c in ("phone", "zip"):
            continue

        match_len = df[match_col].astype(str).map(len)
        own_len = df[own_col].astype(str).map(len)
        longest = np.maximum(match_len, own_len)

        df[f"match_{c}_nleven"] = (df[f"match_{c}_leven"] / longest).astype(np.float16)
        df[f"match_{c}_nlcsi"] = (df[f"match_{c}_lcs"] / match_len).astype(np.float16)
        df[f"match_{c}_nlcs0"] = (df[f"match_{c}_lcs"] / own_len).astype(np.float16)

    return df


def add_distance_features_new(df):
    """Compute the string-distance features country by country in parallel.

    Each country group is handed to ``_add_distance_features_new`` in a worker
    pool sized to the CPU count; the results are concatenated.

    NOTE: ``imap_unordered`` returns groups in completion order, so the row
    order of the result can differ between runs.

    Args:
        df: candidate pairs containing a ``country`` column.

    Returns:
        Concatenation of all processed groups.
    """
    n_workers = multiprocessing.cpu_count()
    with multiprocessing.Pool(processes=n_workers) as pool:
        results = list(tqdm(pool.imap_unordered(_add_distance_features_new,
                                                df.groupby('country'))))
    return pd.concat(results)

In [77]:
def add_vec_sim_features(df, vec, vec_name, col):
    """Add the cosine similarity between the vectors of ``col`` and ``match_<col>``.

    Args:
        df: candidate pairs containing ``col`` and ``match_<col>``.
        vec: mapping from text to its vector.
        vec_name: prefix of the new column name.
        col: base name of the text column.

    Returns:
        ``df`` with a float16 column ``<vec_name>_sim_<col>``; pairs where
        either side is the string ``"nan"`` get ``-1``.
    """
    feature = f"{vec_name}_sim_{col}"
    pairs = df[[f"{col}", f"match_{col}"]].values.astype(str)

    def similarity(left, right):
        # Missing values arrive as the literal string "nan".
        if left == "nan" or right == "nan":
            return -1
        left_vec = vec[left].reshape(1, -1)
        right_vec = vec[right].reshape(1, -1)
        return cosine_similarity(left_vec, right_vec)[0][0]

    df[feature] = [similarity(left, right) for left, right in tqdm(pairs)]
    df[feature] = df[feature].astype(np.float16)
    return df

In [78]:
def add_vec_sim_features_new(df, vec, vec_name, col):
    """Add the cosine similarity between the vectors of ``col`` and ``match_<col>``.

    Faster replacement for ``add_vec_sim_features``: the similarity is computed
    directly with NumPy instead of going through sklearn's pairwise API.

    Args:
        df: candidate pairs containing ``col`` and ``match_<col>``.
        vec: mapping from text to a dense 1-D vector.
        vec_name: prefix of the new column name.
        col: base name of the text column.

    Returns:
        ``df`` with a float16 column ``<vec_name>_sim_<col>``; pairs where
        either side is the string ``"nan"`` get ``-1``.
    """
    feature = f"{vec_name}_sim_{col}"
    sim_list = []
    for str1, str2 in tqdm(df[[f"{col}", f"match_{col}"]].values.astype(str)):
        if str1 != "nan" and str2 != "nan":
            # Use the module-level ``np`` rather than bare ``dot``/``norm``,
            # which are only imported in a later cell. Accumulate in float32 so
            # half-precision inputs do not lose accuracy in the dot product.
            v1 = np.asarray(vec[str1], dtype=np.float32)
            v2 = np.asarray(vec[str2], dtype=np.float32)
            denom = np.linalg.norm(v1) * np.linalg.norm(v2)
            # A zero vector has no direction; report 0 similarity instead of
            # dividing by zero.
            sim = float(np.dot(v1, v2) / denom) if denom > 0 else 0.0
        else:
            sim = -1
        sim_list.append(sim)
    df[feature] = sim_list
    # Same storage dtype as add_vec_sim_features, independent of the input dtype.
    df[feature] = df[feature].astype(np.float16)
    return df

In [79]:
train_origin = pd.read_csv(INPUT_DIR + "train.csv")
train_origin = preprocess(train_origin)

In [80]:
bert_vec_name = make_bert_vec(train_origin[["name"]], "name")

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [81]:
# Experiment: embed the names fold by fold, with folds grouped by country.
kf = GroupKFold(n_splits=5)
for i, (trn_idx, val_idx) in enumerate(kf.split(train_origin, train_origin['country'], train_origin['country'])):
        # val_idx holds positions; .loc works here only while train_origin has
        # its default integer index -- TODO confirm.
        # bert_vec_name_batch is overwritten on every iteration, and each
        # make_bert_vec call builds a new model inside inference().
        bert_vec_name_batch = make_bert_vec(train_origin.loc[val_idx][["name"]], "name")

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_l

KeyboardInterrupt: 

In [None]:
str1 = "h&m rue neuve"
str2 = "h&m"

In [None]:
%%timeit
sim = cosine_similarity(bert_vec_name[str1].reshape(1, -1), bert_vec_name[str2].reshape(1, -1))[0][0]

116 µs ± 647 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [None]:
cosine_similarity(bert_vec_name[str1].reshape(1, -1), bert_vec_name[str2].reshape(1, -1))[0][0]

0.6982308825235138

In [None]:
dot(bert_vec_name[str1], bert_vec_name[str2]) / (norm(bert_vec_name[str1])*norm(bert_vec_name[str2]))

0.698

In [None]:
%%timeit
sim = cosine_similarity(bert_vec_name[str1].reshape(1, -1), bert_vec_name[str2].reshape(1, -1))[0][0]

113 µs ± 287 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [None]:
from numpy import dot
from numpy.linalg import norm

In [None]:
%%timeit
sim = dot(bert_vec_name[str1], bert_vec_name[str2]) / (norm(bert_vec_name[str1])*norm(bert_vec_name[str2]))

16.6 µs ± 20.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [None]:
# Split the train data into 20 folds, grouped by point_of_interest so that all
# rows of one POI fall into the same fold.
kf = GroupKFold(n_splits=20)
for i, (trn_idx, val_idx) in enumerate(kf.split(train_origin, train_origin['point_of_interest'], train_origin['point_of_interest'])):
    # val_idx holds positions; .loc works here only while train_origin has its
    # default integer index -- TODO confirm.
    train_origin.loc[val_idx, "set"] = i

# 1st stage
# Build distance-based candidate pairs from fold 0 only, then attach the
# matched row's columns with a "match_" prefix.
train = extract_candidate_dist(train_origin[train_origin["set"]==0])
train = add_orgin_data(train, train_origin)

# Two independent copies so the old and new similarity functions can be
# timed and compared on identical input.
old = train.copy()
new = train.copy()

100%|██████████| 151/151 [00:04<00:00, 35.83it/s]


In [None]:
%%time
old = add_vec_sim_features(old, bert_vec_name, "bert", "name")

100%|██████████| 511642/511642 [01:02<00:00, 8229.64it/s]

CPU times: user 1min 2s, sys: 565 ms, total: 1min 3s
Wall time: 1min 2s





In [None]:
%%time
new = add_vec_sim_features_new(new, bert_vec_name, "bert", "name")

100%|██████████| 511642/511642 [00:09<00:00, 51399.36it/s]

CPU times: user 10.2 s, sys: 95.8 ms, total: 10.3 s
Wall time: 10.2 s





In [None]:
old["bert_sim_name"]

0         1.000000
1         0.415771
2         0.461182
3         0.461182
4         0.529785
            ...   
511637    0.563965
511638    0.545410
511639    0.472656
511640    0.451660
511641    0.451660
Name: bert_sim_name, Length: 511642, dtype: float16

In [None]:
new["bert_sim_name"]

0         1.000000
1         0.415771
2         0.461182
3         0.461182
4         0.529785
            ...   
511637    0.563965
511638    0.545410
511639    0.472656
511640    0.451660
511641    0.451660
Name: bert_sim_name, Length: 511642, dtype: float16

In [None]:
def add_vec_sim_features_new(df, vec, vec_name, col):
    """Add the cosine similarity between the vectors of ``col`` and ``match_<col>``.

    Faster replacement for ``add_vec_sim_features``: the similarity is computed
    directly with NumPy instead of going through sklearn's pairwise API.

    Args:
        df: candidate pairs containing ``col`` and ``match_<col>``.
        vec: mapping from text to a dense 1-D vector.
        vec_name: prefix of the new column name.
        col: base name of the text column.

    Returns:
        ``df`` with a float16 column ``<vec_name>_sim_<col>``; pairs where
        either side is the string ``"nan"`` get ``-1``.
    """
    feature = f"{vec_name}_sim_{col}"
    sim_list = []
    for str1, str2 in tqdm(df[[f"{col}", f"match_{col}"]].values.astype(str)):
        if str1 != "nan" and str2 != "nan":
            # Use the module-level ``np`` rather than bare ``dot``/``norm`` so
            # the function does not depend on a separate import cell having
            # run. Accumulate in float32 so half-precision inputs do not lose
            # accuracy in the dot product.
            v1 = np.asarray(vec[str1], dtype=np.float32)
            v2 = np.asarray(vec[str2], dtype=np.float32)
            denom = np.linalg.norm(v1) * np.linalg.norm(v2)
            # A zero vector has no direction; report 0 similarity instead of
            # dividing by zero.
            sim = float(np.dot(v1, v2) / denom) if denom > 0 else 0.0
        else:
            sim = -1
        sim_list.append(sim)
    df[feature] = sim_list
    # Same storage dtype as add_vec_sim_features, independent of the input dtype.
    df[feature] = df[feature].astype(np.float16)
    return df