In [1]:
import os
import joblib
from datetime import datetime
from typing import Dict, Tuple, Any
from tqdm import tqdm
import pickle
from collections import defaultdict

import math
import numpy as np
import pandas as pd

from scipy.special import softmax
from sklearn.model_selection import train_test_split, StratifiedKFold

import cv2
import albumentations
from torch.utils.data import Dataset

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.autograd import Variable
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import OneCycleLR, ReduceLROnPlateau

import lightgbm as lgb

import timm

import psutil
def get_used_memory():
    return psutil.Process(os.getpid()).memory_info().vms/1024**3
def get_used_memory_txt():
    return 'Used memory: {:.2f}'.format(get_used_memory())
initial_used_memory=get_used_memory()
print(get_used_memory_txt())

Used memory: 4.27


In [2]:
df = pd.read_csv('../input/train.csv')
df_full = pd.read_csv('../input/train_full.csv')
df_lm = df_full[df_full['landmark_id'].isin(df['landmark_id'].unique())].reset_index(drop=True)
df_lm.drop(columns='url', inplace=True)
lm_img2embd_map = {img_id: i for i, img_id in enumerate(df_lm['id'])}
lm_id2class_map = {id_: i for i, id_ in enumerate(sorted(df_lm['landmark_id'].unique()))}
df_lm['class'] = df_lm['landmark_id'].map(lambda x: lm_id2class_map[x])
lm_img2cls_map = defaultdict(lambda: -1)
lm_img2cls_map.update({img: c for img, c in zip(df_lm['id'], df_lm['class'])})

In [3]:
df_nlm = pd.read_csv('../input/recognition_solution_v2.1.csv')
df_nlm = df_nlm[df_nlm['landmarks'].isna()].reset_index(drop=True)
nlm_img2embd_map = {img_id: i for i, img_id in enumerate(df_nlm['id'])}
nlm_img2cls_map = defaultdict(lambda: -1)
nlm_img2cls_map.update({img: 81313 for img in df_nlm['id']})

In [4]:
del df, df_full, df_lm, df_nlm

In [5]:
df_train = pd.read_csv('./final/df_vector_discriminator.csv')

In [7]:
df_train['class'] = [max(lm_img2cls_map[id0], lm_img2cls_map[id1], nlm_img2cls_map[id0], nlm_img2cls_map[id1]) for id0, id1 in zip(df_train['img_id'], df_train['img_id_crossed'])]

In [14]:
from sklearn.model_selection import StratifiedKFold

In [15]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

df_train['fold'] = -1

for i, (_, vld_idx) in enumerate(skf.split(df_train.index, df_train['class'])):
    df_train['fold'].iloc[vld_idx] = i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [22]:
df_train.to_csv('./final/VD_dataframe_withFolds.csv')

In [22]:
embedding_std = np.load('./final/3.2m_train_landmarks_ensembled_f16_pca.npy')
embedding_dba = np.load('./final/3.2m_train_landmarks_ensembled_f16_pca_dba.npy')
embedding_npm = np.load('./final/115k_non_landmarks_ensembled_f16_pca.npy')

In [21]:
lm_img_set = set(df_lm['id'])

In [23]:
for imgid in df_train.loc[df_train['is_nl'] == 1, 'img_id']:
    assert imgid not in lm_img_set

In [3]:
FOLD = 0

trn_idx = df_train.loc[df_train['fold'] != FOLD].index.values
vld_idx = df_train.loc[df_train['fold'] == FOLD].index.values

trn_array = np.zeros((len(trn_idx), 512*4), dtype=np.float32)
trn_label = df_train['target'].iloc[trn_idx].values

vld_array = np.zeros((len(vld_idx), 512*4), dtype=np.float32)
vld_label = df_train['target'].iloc[vld_idx].values

In [4]:
for i, idx in tqdm(enumerate(trn_idx), total=len(trn_idx)):
    embd0 = embeddings[imgId2embedding[df_train['img_id'].iloc[idx]]]
    embd1 = embeddings[imgId2embedding[df_train['img_id_crossed'].iloc[idx]]]
    trn_array[i, :512] = embd0
    trn_array[i, 512:1024] = embd1
    trn_array[i, 1024:1536] = np.nan_to_num(embd1 - embd0)
    embd0[embd0 == 0] = .001
    embd1[embd1 == 0] = .001
    trn_array[i, 1536:] = np.nan_to_num(embd1/embd0)
    
for i, idx in tqdm(enumerate(vld_idx), total=len(vld_idx)):
    embd0 = embeddings[imgId2embedding[df_train['img_id'].iloc[idx]]]
    embd1 = embeddings[imgId2embedding[df_train['img_id_crossed'].iloc[idx]]]
    vld_array[i, :512] = embd0
    vld_array[i, 512:1024] = embd1
    vld_array[i, 1024:1536] = np.nan_to_num(embd1 - embd0)
    embd0[embd0 == 0] = .001
    embd1[embd1 == 0] = .001
    vld_array[i, 1536:] = np.nan_to_num(embd1/embd0)

  trn_array[i, 1536:] = np.nan_to_num(embd1/embd0)
100%|██████████| 7483444/7483444 [07:44<00:00, 16118.19it/s]
  vld_array[i, 1536:] = np.nan_to_num(embd1/embd0)
100%|██████████| 1870861/1870861 [01:56<00:00, 16073.70it/s]


In [6]:
model = lgb.LGBMClassifier(objective='binary', learning_rate=0.01, n_estimators=10000, n_jobs=24)

In [7]:
model.fit(
    X=trn_array,
    y=trn_label,
    eval_set=(vld_array, vld_label),
    eval_metric=['auc', 'binary_logloss'],
    early_stopping_rounds=100,
    verbose=10
)

Training until validation scores don't improve for 100 rounds
[10]	valid_0's auc: 0.759652	valid_0's binary_logloss: 0.686373
[20]	valid_0's auc: 0.789391	valid_0's binary_logloss: 0.679893
[30]	valid_0's auc: 0.805153	valid_0's binary_logloss: 0.673624
[40]	valid_0's auc: 0.81411	valid_0's binary_logloss: 0.667548
[50]	valid_0's auc: 0.820215	valid_0's binary_logloss: 0.661676
[60]	valid_0's auc: 0.824923	valid_0's binary_logloss: 0.656021
[70]	valid_0's auc: 0.828429	valid_0's binary_logloss: 0.650575
[80]	valid_0's auc: 0.831105	valid_0's binary_logloss: 0.64558
[90]	valid_0's auc: 0.8332	valid_0's binary_logloss: 0.640597
[100]	valid_0's auc: 0.834825	valid_0's binary_logloss: 0.635691
[110]	valid_0's auc: 0.836332	valid_0's binary_logloss: 0.631043
[120]	valid_0's auc: 0.83773	valid_0's binary_logloss: 0.62649
[130]	valid_0's auc: 0.838896	valid_0's binary_logloss: 0.622104
[140]	valid_0's auc: 0.840004	valid_0's binary_logloss: 0.617847
[150]	valid_0's auc: 0.840899	valid_0's bin

LGBMClassifier(learning_rate=0.01, n_estimators=10000, n_jobs=24,
               objective='binary')

In [15]:
joblib.dump(model, './model_checkpoints/lgbm_discriminator.lgb')

['./model_checkpoints/lgbm_discriminator.lgb']