## 変更点

- run()を add_all_features(), split_train_test_pair() と train() に分離
- train() でモデル名を指定して学習と推論
- 予測値は confidence_{model_name} に出力

## Change Logs

- shopee-submission-pipeline_v0507-1100.ipynb
    - テキストの正規化処理を追加
    - CV F1: 0.7555
- shopee-submission-pipeline_v0507-1200.ipynb
    - グラフ特徴量を追加
    - CV F1: 0.7799
- shopee-submission-pipeline_v0507-1300.ipynb
    - CV F1: 0.7809
- shopee-submission-pipeline_v0507-2100.ipynb
    - CV F1: 0.7812
- shopee-submission-pipeline_v0507-2200.ipynb
    - CV F1: 0.7817
    - effnet b3 512x512をembeddins抽出に追加
- shopee-submission-pipeline_v0508-0900.ipynb
    - CV F1: 0.7826
    - 類似度同士の積を特徴量として追加
    - コサイン類似度のメモリ節約&高速化
- shopee-submission-pipeline_v0508-1500.ipynb
    - CV F1: 0.7837
    - TF-IDFでstop wordsやbinary=True, max_featuresを追加
- 👑  shopee-submission-pipeline_v0508-1600.ipynb
    - CV F1: 0.7848
    - num kfolds を 2 -> 5に変更

In [209]:
import warnings
warnings.filterwarnings('ignore')

import os
import sys
import pandas as pd

if "google.colab" in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/MyDrive/git/kaggle-shopee/working

    !pip install python-Levenshtein
    !pip install tensorflow-addons
    !pip install timm
    
    #!pip install -U pandas
    !pip install -U xgboost
    !pip install -U albumentations
    !pip install -U python-igraph
    !pip install -U unidecode
    !pip install catboost
    !pip install -U transformers

    if not os.path.exists("/content/shopee-product-matching"):
        os.makedirs("/content/shopee-product-matching")
        !cp ../input/shopee-product-matching.zip /content/shopee-product-matching
        !unzip /content/shopee-product-matching/shopee-product-matching.zip -d /content/shopee-product-matching

#sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')
sys.path.append("/content/drive/MyDrive/git/kaggle-shopee/input/shopee-toolkit")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/git/kaggle-shopee/working
[31mERROR: Operation cancelled by user[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3021, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.7/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2815, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/pip/_internal/cli/base_command.py", line 153, in _main
    status = self.run(options, args)
  File "/usr/local/lib/python3.7/dist-packages/pip/_internal/commands/install.py", line 438, in run
    self._warn_about_conflicts(to_install)
  File "/usr/local/lib/python

In [211]:

import shopee
'''
%load_ext autoreload
%autoreload
import shopee
'''


'\n%load_ext autoreload\n%autoreload\nimport shopee\n'

In [219]:
class Config:
    def __init__(self, debug:bool):
        self.debug = debug # Trueならメモリや実行時間を超えないか検証
        self.suffix = "_debug" if self.debug else ""
        self.train_nrows = 1000
        self.image_size = 256
        self.batch_size = 32
        self.num_kfolds = 5
        self.max_candidates = 20
        self.max_seq_length = 100
        self.use_graph_features = True
        self.use_cache = True
        self.weight_rate = 1 # xgbモデル学習時に利用するpositiveクラスの重み
        self.use_fast_neighbors = True # 候補点抽出時にGPUを使うかどうか(メモリ使用量大)
        # 画像特徴量抽出アルゴリズム
        self.entry_ids = [
            # "effnet-b0_512x512",
            "effnet-b3_512x512",
            # "effnet-b5_512x512",
            "effnet-b0_256x256",
            # "effnet-b3_256x256",
            # "effnet-b5_256x256",
            # "mobilenet-v2_256x256",
            "tfidf-v1",
            # "roberta-base",
            # "bert-base-uncased",
            # "bert-base-multilingual-uncased",
        ]
        self.feature_names = [
            "edit_distance",
            "feat_effnet-b0_256x256",
            "feat_effnet-b0_256x256_x_feat_tfidf-v1",
            "feat_effnet-b3_512x512",
            "feat_effnet-b3_512x512_x_feat_effnet-b0_256x256",
            "feat_effnet-b3_512x512_x_feat_tfidf-v1",
            "feat_tfidf-v1",
        ]
        if self.use_graph_features:
            self.feature_names += [
                "neighborhood_size1_pid","neighborhood_size1_cpid","neighborhood_size1_diff",
                "neighborhood_size2_pid","neighborhood_size2_cpid","neighborhood_size2_diff",
                "authority_score_pid","authority_score_cpid","authority_score_diff",
                "constraint_pid","constraint_cpid","constraint_diff",
                "pagerank_pid", "pagerank_cpid", "pagerank_diff",
                "strength_pid","strength_cpid","strength_diff",
                "transitivity_local_undirected_pid", "transitivity_local_undirected_cpid", "transitivity_local_undirected_diff",
                "intersection_neighbors_num", "union_neighbors_num", "xor_neighbors_num", "intersection_neighbors_rate"
            ]
        self.data_directory = "../data"
        self.pair_filepath = os.path.join(
            self.data_directory,
            f"pair_matching{self.suffix}.csv"
        )

config = Config(debug=False)
os.makedirs(config.data_directory, exist_ok=True)

## Load dataset

In [220]:
train_df = shopee.datasets.load_train_data()
if config.debug:
    train_df = train_df.head(100)
test_df = shopee.datasets.load_test_data()

if "google.colab" in sys.modules:
    train_df["filepath"] = train_df["image"].apply(lambda _: os.path.join("/content/shopee-product-matching/train_images", _))
    test_df["filepath"] = test_df["image"].apply(lambda _: os.path.join("/content/shopee-product-matching/test_images", _))

## Pair matching


In [221]:
%%time
pair_df = shopee.matching.make_candidates(
    debug=config.debug,
    train_df=train_df,
    test_df=test_df,
    use_cache=config.use_cache,
    entry_ids=config.entry_ids,
    max_candidates=config.max_candidates,
    max_seq_length=config.max_seq_length,
    use_fast_neighbors=config.use_fast_neighbors
)

Calculate embeddings with effnet-b3_512x512
Load from ../input/shopee-train-embeddings/train-embeddings-effnet-b3_512x512.npy
{'id': 'effnet-b3_512x512', 'classname': '<function EfficientNetB3 at 0x7ffa7a6b0680>', 'model_type': 'keras-origin', 'weights_filepath': '../input/all-in-one-packages/models/effnet-b3.h5', 'train_embeddings_filepath': '../input/shopee-train-embeddings/train-embeddings-effnet-b3_512x512.npy', 'image_size': 512}
Calculate embeddings with effnet-b0_256x256
Load from ../input/shopee-train-embeddings/train-embeddings-effnet-b0_256x256.npy
{'id': 'effnet-b0_256x256', 'classname': '<function EfficientNetB0 at 0x7ffa7a6b04d0>', 'model_type': 'keras-origin', 'weights_filepath': '../input/all-in-one-packages/models/effnet-b0.h5', 'train_embeddings_filepath': '../input/shopee-train-embeddings/train-embeddings-effnet-b0_256x256.npy', 'image_size': 256}
Calculate embeddings with tfidf-v1
vocab size: 24552
<class 'numpy.ndarray'>
no


100%|██████████| 343/343 [00:01<00:00, 225.75it/s]


<class 'numpy.ndarray'>
no


100%|██████████| 343/343 [00:01<00:00, 238.99it/s]
  1%|▏         | 5/343 [00:00<00:07, 47.38it/s]

<class 'scipy.sparse.csr.csr_matrix'>
ok


100%|██████████| 343/343 [00:07<00:00, 47.59it/s]


candidate indices 34253
calculate distance
CPU times: user 1min 14s, sys: 26.1 s, total: 1min 41s
Wall time: 1min 16s


In [222]:
print(f"Save to {config.pair_filepath}")
pair_df.to_csv(config.pair_filepath, index=False)

Save to ../data/pair_matching.csv


## Training

In [223]:
%%time
""" add_all_features, split_train_test_pair と train に分割
train_pair_df, test_pair_df = shopee.training.run(
    train_df,
    test_df,
    pair_df,
    weight_rate=config.weight_rate,
    use_graph_features=config.use_graph_features,
    num_kfolds=config.num_kfolds,
    feature_names=config.feature_names,
)
"""
shopee.feature_extraction.add_all_features(
    train_df,
    test_df,
    pair_df,
    use_graph_features=config.use_graph_features,
)

train_pair_df, test_pair_df = shopee.training.split_train_test_pair(
    train_df,
    test_df,
    pair_df,
)


making graph features ...
 neighborhood_size1
 neighborhood_size2
 authority_score
 constraint
 pagerank
 strength
 transitivity_local_undirected(clustering coefficient)
 making neighbors set
 intersection
 union
 xor
CPU times: user 1min 18s, sys: 857 ms, total: 1min 19s
Wall time: 1min 19s


In [224]:
# 理論値(正解を突っ込んでるだけなので100%当たった状態)
train_pair_df["prediction"] = train_pair_df["matched"].copy()
shopee.metrics.show_score(train_df, train_pair_df)

positive ratio: 0.1097
total: 201752
TP: 174201
FP: 0
FN: 27551
TN: 1413444
F1: 0.9716


## xgb 

In [225]:
%%time

train_pair_df, test_pair_df = shopee.training.train(
    train_df,
    test_df,
    train_pair_df, test_pair_df,
    weight_rate=config.weight_rate,
    num_kfolds=config.num_kfolds,
    feature_names=config.feature_names,
    model_name="xgb"
)

[0]	validation_0-logloss:0.47899	validation_1-logloss:0.51530
[10]	validation_0-logloss:0.13443	validation_1-logloss:0.25791
[20]	validation_0-logloss:0.11801	validation_1-logloss:0.25096
[30]	validation_0-logloss:0.11198	validation_1-logloss:0.24861
[40]	validation_0-logloss:0.10754	validation_1-logloss:0.25010
[50]	validation_0-logloss:0.10446	validation_1-logloss:0.25102
[0]	validation_0-logloss:0.47832	validation_1-logloss:0.51543
[10]	validation_0-logloss:0.13376	validation_1-logloss:0.26188
[20]	validation_0-logloss:0.11782	validation_1-logloss:0.25743
[30]	validation_0-logloss:0.11185	validation_1-logloss:0.25646
[40]	validation_0-logloss:0.10805	validation_1-logloss:0.25775
[48]	validation_0-logloss:0.10527	validation_1-logloss:0.25785
[0]	validation_0-logloss:0.47831	validation_1-logloss:0.51488
[10]	validation_0-logloss:0.13314	validation_1-logloss:0.25638
[20]	validation_0-logloss:0.11690	validation_1-logloss:0.24959
[30]	validation_0-logloss:0.11055	validation_1-logloss:0.2

In [226]:
test_pair_df.shape

(3, 42)

In [227]:
print("\n".join(sorted(test_pair_df.columns)))

authority_score_cpid
authority_score_diff
authority_score_pid
candidate_neighbors_set
candidate_posting_id
candidate_posting_id_index
candidate_posting_id_phash
confidence_xgb
constraint_cpid
constraint_diff
constraint_pid
edit_distance
feat_effnet-b0_256x256
feat_effnet-b0_256x256_x_feat_tfidf-v1
feat_effnet-b3_512x512
feat_effnet-b3_512x512_x_feat_effnet-b0_256x256
feat_effnet-b3_512x512_x_feat_tfidf-v1
feat_tfidf-v1
intersection_neighbors_num
intersection_neighbors_rate
matched
neighborhood_size1_cpid
neighborhood_size1_diff
neighborhood_size1_pid
neighborhood_size2_cpid
neighborhood_size2_diff
neighborhood_size2_pid
pagerank_cpid
pagerank_diff
pagerank_pid
posting_id
posting_id_index
posting_id_phash
posting_neighbors_set
strength_cpid
strength_diff
strength_pid
transitivity_local_undirected_cpid
transitivity_local_undirected_diff
transitivity_local_undirected_pid
union_neighbors_num
xor_neighbors_num


In [228]:
# --------------------------- new
train_pair_df["confidence"] = train_pair_df["confidence_xgb"]
test_pair_df["confidence"] = test_pair_df["confidence_xgb"]
# ---------------------------


# テストと同程度のpositive:negativeの割合になるように調整しておく
import random
train_pair_only = train_pair_df[
    train_pair_df["posting_id"].str.contains("train_", na=False) & 
    train_pair_df["candidate_posting_id"].str.contains("train_", na=False)
]
train_cpid_test = train_pair_df[
    train_pair_df["posting_id"].str.contains("train_", na=False) & 
    train_pair_df["candidate_posting_id"].str.contains("test_", na=False)
].sample(frac=0.5)
train_pair_only = pd.concat([
    train_pair_only,
    train_cpid_test
])

best, threshold = shopee.optimization.find_optimal_threshold(
    df=train_df,
    pair_df=train_pair_only
)
print("optimial threshold: %.4f" % threshold)
train_pair_df["prediction"] = (train_pair_df["confidence"] > threshold).astype(int)
shopee.metrics.show_score(train_df, train_pair_df)

optimial threshold: 0.4843
positive ratio: 0.1097
total: 201752
TP: 129447
FP: 31001
FN: 72305
TN: 1382443
F1: 0.7849


# catboost


In [247]:
%%time
train_pair_df, test_pair_df = shopee.training.train(
    train_df,
    test_df,
    train_pair_df, test_pair_df,
    weight_rate=config.weight_rate,
    num_kfolds=2,#config.num_kfolds,
    feature_names=config.feature_names,
    model_name="cat"
)

CPU times: user 30min 29s, sys: 24.6 s, total: 30min 53s
Wall time: 8min 43s


In [248]:
# --------------------------- new
train_pair_df["confidence"] = train_pair_df["confidence_cat"]
test_pair_df["confidence"] = test_pair_df["confidence_cat"]
# ---------------------------


# テストと同程度のpositive:negativeの割合になるように調整しておく
import random
train_pair_only = train_pair_df[
    train_pair_df["posting_id"].str.contains("train_", na=False) & 
    train_pair_df["candidate_posting_id"].str.contains("train_", na=False)
]
train_cpid_test = train_pair_df[
    train_pair_df["posting_id"].str.contains("train_", na=False) & 
    train_pair_df["candidate_posting_id"].str.contains("test_", na=False)
].sample(frac=0.5)
train_pair_only = pd.concat([
    train_pair_only,
    train_cpid_test
])

best, threshold = shopee.optimization.find_optimal_threshold(
    df=train_df,
    pair_df=train_pair_only
)
print("optimial threshold: %.4f" % threshold)
train_pair_df["prediction"] = (train_pair_df["confidence"] > threshold).astype(int)
shopee.metrics.show_score(train_df, train_pair_df)

optimial threshold: 0.5908
positive ratio: 0.1097
total: 201752
TP: 130278
FP: 31477
FN: 71474
TN: 1381967
F1: 0.7875


fold 2
Wall time: 8min 43s
```
optimial threshold: 0.5908
positive ratio: 0.1097
total: 201752
TP: 130278
FP: 31477
FN: 71474
TN: 1381967
F1: 0.7875
```

fold 5
Wall time: 42min 20s
```
optimial threshold: 0.4687
positive ratio: 0.1097
total: 201752
TP: 132268
FP: 32098
FN: 69484
TN: 1381346
F1: 0.7903
```

In [244]:
%load_ext autoreload
%autoreload
import shopee

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# neuralnet

In [230]:
# 何もしない場合は欠損値は自動で平均埋め
train_pair_df, test_pair_df = shopee.training.train(
    train_df,
    test_df,
    train_pair_df, test_pair_df,
    weight_rate=config.weight_rate,
    num_kfolds=config.num_kfolds,
    feature_names=config.feature_names,
    model_name="nn",
    nn_params = {
                "epochs": 20,
                "bs": 512,
                "hidden_size": 100,
                "dropout_rate": 0.05,
                "layer_num": 3,
                "scheduler": "CosineAnnealingWarmRestarts",
                "T_0": 5,
                "lr": 3e-2,
                "min_lr": 5e-4,
                "momentum": 0.9,
                "early_stopping_step": 5,
                "early_stop": True,
                "seed": 41,
                "num_class": 2
    }
)

X_train: 1041059 X_valid: 90490  X_allvalid: 318829
FOLD:0, EPOCH: 0, train_loss: 0.1338, valid_loss: 0.2516
FOLD:0, EPOCH: 1, train_loss: 0.1285, valid_loss: 0.2454
FOLD:0, EPOCH: 2, train_loss: 0.1272, valid_loss: 0.2172
FOLD:0, EPOCH: 3, train_loss: 0.1263, valid_loss: 0.2444
FOLD:0, EPOCH: 4, train_loss: 0.1254, valid_loss: 0.2312
FOLD:0, EPOCH: 5, train_loss: 0.1247, valid_loss: 0.2527
FOLD:0, EPOCH: 6, train_loss: 0.1240, valid_loss: 0.2376
FOLD:0, EPOCH: 7, train_loss: 0.1233, valid_loss: 0.2501
X_train: 1044482 X_valid: 91185  X_allvalid: 317609
FOLD:1, EPOCH: 0, train_loss: 0.1335, valid_loss: 0.2959
FOLD:1, EPOCH: 1, train_loss: 0.1288, valid_loss: 0.3289
FOLD:1, EPOCH: 2, train_loss: 0.1269, valid_loss: 0.2749
FOLD:1, EPOCH: 3, train_loss: 0.1256, valid_loss: 0.2515
FOLD:1, EPOCH: 4, train_loss: 0.1251, valid_loss: 0.3329
FOLD:1, EPOCH: 5, train_loss: 0.1243, valid_loss: 0.3080
FOLD:1, EPOCH: 6, train_loss: 0.1240, valid_loss: 0.2522
FOLD:1, EPOCH: 7, train_loss: 0.1234, val

KeyboardInterrupt: ignored

In [None]:
train_pair_df["confidence"] = train_pair_df["confidence_nn"]
test_pair_df["confidence"] = test_pair_df["confidence_nn"]


# テストと同程度のpositive:negativeの割合になるように調整しておく
import random
train_pair_only = train_pair_df[
    train_pair_df["posting_id"].str.contains("train_", na=False) & 
    train_pair_df["candidate_posting_id"].str.contains("train_", na=False)
]
train_cpid_test = train_pair_df[
    train_pair_df["posting_id"].str.contains("train_", na=False) & 
    train_pair_df["candidate_posting_id"].str.contains("test_", na=False)
].sample(frac=0.5)
train_pair_only = pd.concat([
    train_pair_only,
    train_cpid_test
])

best, threshold = shopee.optimization.find_optimal_threshold(
    df=train_df,
    pair_df=train_pair_only
)
print("optimial threshold: %.4f" % threshold)
train_pair_df["prediction"] = (train_pair_df["confidence"] > threshold).astype(int)
shopee.metrics.show_score(train_df, train_pair_df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.histplot(data=train_pair_df, x="confidence_nn", hue="matched",bins=40, log_scale=(False, False))


## emsemble

In [None]:
rates = {
    "xgb": 0.5,
    "nn": 0.5
}
train_pair_df["confidence"] = (train_pair_df["confidence_nn"]*rates["nn"] + train_pair_df["confidence_xgb"]*rates["xgb"])
test_pair_df["confidence"] = (test_pair_df["confidence_nn"]*rates["nn"] + test_pair_df["confidence_xgb"]*rates["xgb"])

import random
train_pair_only = train_pair_df[
    train_pair_df["posting_id"].str.contains("train_", na=False) & 
    train_pair_df["candidate_posting_id"].str.contains("train_", na=False)
]
train_cpid_test = train_pair_df[
    train_pair_df["posting_id"].str.contains("train_", na=False) & 
    train_pair_df["candidate_posting_id"].str.contains("test_", na=False)
].sample(frac=0.5)
train_pair_only = pd.concat([
    train_pair_only,
    train_cpid_test
])

best, threshold = shopee.optimization.find_optimal_threshold(
    df=train_df,
    pair_df=train_pair_only # train_pair_df
)
print("optimial threshold: %.4f" % threshold)
train_pair_df["prediction"] = (train_pair_df["confidence"] > threshold).astype(int)
shopee.metrics.show_score(train_df, train_pair_df)

In [None]:
test_pair_df["prediction"] = (test_pair_df["confidence"] > threshold).astype(int)
gdf = test_pair_df[test_pair_df["prediction"] == 1].groupby(
    "posting_id",
    as_index=False
)["candidate_posting_id"].apply(lambda _: " ".join(_))
submission_df = pd.read_csv("../input/shopee-product-matching/sample_submission.csv", usecols=["posting_id"])
submission_df = pd.merge(submission_df, gdf, on="posting_id", how="left")
submission_df = submission_df.rename(columns={
    "candidate_posting_id": "matches"
})
submission_df[["posting_id", "matches"]].to_csv("./submission.csv", index=False)

In [None]:
# 提出形式の確認
!head ./submission.csv