In [None]:
# The main idea of this notebook is to expand set of same point using transitivity

In [1]:
import pandas as pd

In [2]:
import numpy as np
import gc
gc.collect()

101

In [None]:
DIR = '/kaggle/working/train_data_githubbed'

In [3]:
train_df = pd.read_parquet(f'{DIR}/valid.pqt')
val_df = pd.read_parquet(f'{DIR}/train.pqt')

In [4]:
import catboost as cb

In [5]:
train_df.label.mean()

0.045329371761010095

In [6]:
pos_examples = train_df.label.sum()
pos_train = train_df[train_df.label == 1]
train_df_neg_sample = train_df[train_df.label==0].sample(pos_examples)
val_df_neg_sample = val_df[val_df.label==0].sample(pos_examples)

train_df_new = pd.concat([pos_train, train_df_neg_sample])
val_df_new = pd.concat([val_df[val_df.label == 1], val_df_neg_sample])

In [7]:
train_df.columns[1:]

Index(['kneighbors', 'found', 'kdist_name_0_1', 'kdist_name_4',
       'kneighbors_name_0_1', 'kneighbors_name_4', 'found_name_0_1',
       'found_name_4', 'kdist_country', 'kneighbors_country',
       ...
       'rank_name_dist', 'rank_name_dist_cos', 'rank_simple_sim',
       'rank_address_gesh', 'rank_address_sim', 'rank_categories_sim',
       'rank_name_sim', 'id', 'match_id', 'label'],
      dtype='object', length=103)

In [8]:
train_cols = [col for col in train_df.columns if col not in ('id', 'match_id', 'label')]
cat_features = ['country']

In [9]:
train_pool = cb.Pool(
    train_df_new[train_cols],
    label=train_df_new.label.values,
    feature_names=train_cols,
    cat_features=cat_features,
    #group_id=train_df['id']
)
             
val_pool = cb.Pool(
    val_df_new[train_cols], 
    label=val_df_new.label.values, 
    cat_features=cat_features,
    feature_names=train_cols,
    #group_id=val_df['id']
)

In [10]:
clf_train = cb.CatBoostClassifier().load_model('/kaggle/working/cb_model_vecs_and_cats')
clf_val = cb.CatBoostClassifier().load_model('/kaggle/working/cb_model_vecs_and_cats_inv')

In [11]:
clf_train = cb.CatBoostClassifier().load_model('/kaggle/working/cb_model_vecs_and_cats_inv')
clf_val = cb.CatBoostClassifier().load_model('/kaggle/working/cb_model_vecs_and_cats')

In [12]:
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
train_df['prediction_proba'] = clf_val.predict_proba(train_df[train_cols])[:, 1]
val_df['prediction_proba'] = clf_train.predict_proba(val_df[train_cols])[:, 1]

In [13]:
train_tag, val_tag = 'valid', 'train'#'train', 'valid'
DIR = '/kaggle/input/foursquare-location-matching/train_data_smart_vecs'
id2ids_train = np.load(f'{DIR}/id2ids_{train_tag}.npy', allow_pickle=True).item()
id2ids_val = np.load(f'{DIR}/id2ids_{val_tag}.npy', allow_pickle=True).item()
print(set(id2ids_train) & set(id2ids_val))
id2ids = {k: v for dct in [id2ids_train, id2ids_val] for k, v in dct.items()}

set()


In [14]:
def evaluate(val_df):
    preds_all = val_df.groupby('id').agg(
        {
            'match_id': set, 
            #'country': 'first'
        }
    )
    preds_all = preds_all.rename({'match_id': 'all_match_ids'}, axis=1)
    preds_all.head()
    preds = val_df[val_df.prediction == 1].groupby('id').agg({'match_id': set})
    preds['y_true'] = preds.index.map(id2ids.get)
    preds_agg = preds.join(preds_all)
    preds_agg['findable_ids'] = preds_agg.apply(lambda x: (x.all_match_ids ) & x.y_true, axis=1)
    preds_agg['possible_iou'] = preds_agg.apply(lambda x: len(x.findable_ids) / len(x.y_true), axis=1)
    preds_agg['model_iou'] = preds_agg.apply(lambda x: len(x.match_id & x.findable_ids) / len(x.findable_ids), axis=1)
    preds_agg['iou'] = preds_agg.apply(lambda x: len(x.match_id & x.y_true) / len(x.match_id | x.y_true), axis=1)
    preds_agg['maxtruelen'] = preds_agg.y_true.apply(len)
    preds_agg['found_len'] = preds_agg.match_id.apply(len)
    print(preds_agg[['iou', 'model_iou', 'possible_iou']].mean())

In [15]:
import gc
gc.collect()

21

In [16]:
train_df['prediction'] = train_df.prediction_proba > 0.53
val_df['prediction'] = val_df.prediction_proba > 0.53


In [17]:
train_df.shape

(26339037, 106)

In [18]:
evaluate(train_df)

iou             0.873009
model_iou       0.921346
possible_iou    0.979426
dtype: float64


In [19]:
evaluate(val_df)

iou             0.873612
model_iou       0.921981
possible_iou    0.979461
dtype: float64


In [20]:
from tqdm.auto import tqdm
def evolve(curr_gid, base_gid):
    new_gid = dict()
    for k, v in tqdm(curr_gid.items()):
        new_gid[k] = set(v)
        for i in v:
            new_gid[k] |= base_gid.get(i, set())
    return new_gid

def gen_pred_df(pred_set):
    pred_df = pd.DataFrame(pd.Series(pred_set).explode()).reset_index()
    pred_df.columns = ['id', 'match_id']
    return pred_df

In [21]:
spdf = val_df[['id', 'match_id', 'prediction', 'label', 'prediction_proba']]

preds_df = pd.concat([spdf, spdf.rename(columns={'id': 'match_id', 'match_id': 'id'})]).drop_duplicates(['id', 'match_id'])

In [22]:
spdf.to_parquet(f'spdf_train.pqt')

In [23]:
gc.collect()
preds_df = spdf[spdf.prediction_proba > 0.6]
pred_set_c = preds_df.groupby('id')['match_id'].apply(set).to_dict()
pred_set = preds_df.groupby('id')['match_id'].apply(set).to_dict()
sure_pred_set = spdf[spdf.prediction_proba > 0.7].groupby('id')['match_id'].apply(set).to_dict()
NUM_EVOLUTIONS = 3
for ev in range(NUM_EVOLUTIONS):
    pred_set = evolve(pred_set, sure_pred_set)

for k, v in pred_set_c.items():
    if k in pred_set:
        pred_set[k] -= v
del pred_set_c

  0%|          | 0/569401 [00:00<?, ?it/s]

  0%|          | 0/569401 [00:00<?, ?it/s]

  0%|          | 0/569401 [00:00<?, ?it/s]

In [24]:
pred_df = gen_pred_df(pred_set).dropna()
pred_df['prediction'] = True
pred_df['label'] = pred_df.apply(lambda x: x.match_id in id2ids.get(x.id), axis=1).astype(np.int8)

In [25]:
new_pred_df = pd.concat([
    spdf[['id', 'match_id', 'prediction', 'label']], 
    pred_df
])

In [26]:
for i in [0.47, 0.53, 0.6, 0.65]:
    evaluate(spdf[spdf.prediction_proba > i])

iou             0.873612
model_iou       0.990236
possible_iou    0.915325
dtype: float64
iou             0.873612
model_iou       1.000000
possible_iou    0.907816
dtype: float64
iou             0.870956
model_iou       1.000000
possible_iou    0.897794
dtype: float64
iou             0.867601
model_iou       1.000000
possible_iou    0.889906
dtype: float64


In [27]:
evaluate(preds_df)

iou             0.870956
model_iou       1.000000
possible_iou    0.897794
dtype: float64


In [28]:
evaluate(new_pred_df)

iou             0.878358
model_iou       0.931564
possible_iou    0.983045
dtype: float64
