https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700

# setup

In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Base directory from which every data path in this notebook is built.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
HOME = '/data/git/shopee-product-matching'
# Directory the competition train.csv is read from.
pdata = f'{HOME}/data/shopee-product-matching'

# metric

In [2]:
# def getMetric(col):
#     def f1score(row):
#         n = len( np.intersect1d(row.target,row[col]) )
#         return 2*n / (len(row.target)+len(row[col]))
#     return f1score

def getf1(x,y):
    """Return the F1 overlap between two collections of ids.

    The score is ``2 * |x ∩ y| / (len(x) + len(y))``; the intersection is
    computed with ``np.intersect1d``, which counts each shared value once.

    Raises ZeroDivisionError when both ``x`` and ``y`` are empty.
    """
    shared = np.intersect1d(x, y)
    total = len(x) + len(y)
    return 2 * len(shared) / total

def getf1s(xs,ys):
    """Lazily yield ``getf1`` for each aligned pair drawn from ``xs`` and ``ys``.

    Pairs are formed with ``zip``, so iteration stops at the shorter input.
    Returns a generator; nothing is scored until it is consumed.
    """
    pairs = zip(xs, ys)
    return (getf1(left, right) for left, right in pairs)

def meanf1(xs,ys):
    """Return the mean of the pairwise ``getf1`` scores of ``xs`` against ``ys``.

    Pairs are formed with ``zip``, so only the first ``min(len(xs), len(ys))``
    pairs contribute to the average.
    """
    scores = [getf1(left, right) for left, right in zip(xs, ys)]
    return np.mean(scores)

# data and target

In [3]:
# Load the training table; `pids` is the posting-id column as a plain array so
# that row positions can be translated back to posting ids later on.
df = pd.read_csv(f'{pdata}/train.csv')
pids = df['posting_id'].values

display(df.head(2))

# Ground truth: each row is mapped to the array of all posting ids that share
# its label_group (the row's own id included).
grp2ids = df.groupby('label_group')['posting_id'].agg('unique').to_dict()
targets = df['label_group'].map(grp2ids)

targets[:2]

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045


0     [train_129225211, train_2278313361]
1    [train_3386243561, train_3423213080]
Name: label_group, dtype: object

# baselines

## self baseline

In [4]:
# Trivial baseline: every posting is predicted to match only itself.
preds = df['posting_id'].apply(lambda pid: [pid])
print(preds[0], targets[0])

meanf1(preds, targets)

['train_129225211'] ['train_129225211' 'train_2278313361']


0.4608481791365001

## same hash baseline

In [5]:
# Baseline: postings with an identical image_phash are predicted as matches.
hsh2ids = df.groupby('image_phash')['posting_id'].agg('unique').to_dict()
preds = df['image_phash'].map(hsh2ids)
print(preds[0], targets[0])

meanf1(preds, targets)

['train_129225211'] ['train_129225211' 'train_2278313361']


0.5530933399168149

# feature sims

In [9]:
def get_preds_by_thrsh(sims, thrsh):
    """Convert a similarity matrix into per-row arrays of matching posting ids.

    For each row of ``sims``, the positions whose value is ``>= thrsh`` are
    looked up in the module-level ``pids`` array.

    Returns a list with one entry per row of ``sims``.
    """
    above = sims >= thrsh
    matches = []
    for row in above:
        hit_idx = np.nonzero(row)[0]
        matches.append(pids[hit_idx])
    return matches

def find_best(sims, thrshes = np.linspace(0.7,1.,num=31)):
    """Sweep similarity thresholds and return the best-scoring one.

    For every threshold in ``thrshes`` the predictions produced by
    ``get_preds_by_thrsh(sims, thrsh)`` are scored with ``meanf1`` against the
    module-level ``targets``.

    Parameters
    ----------
    sims : similarity matrix handed to ``get_preds_by_thrsh``.
    thrshes : iterable of thresholds to try.

    Returns
    -------
    (f1_best, thrsh_best, preds_best) : the highest mean F1, the threshold
        that produced it, and the predictions at that threshold. When several
        thresholds reach the same F1, the largest of them is returned.

    Raises
    ------
    ValueError if ``thrshes`` is empty.
    """
    best = None
    for thrsh in tqdm(thrshes):
        preds = get_preds_by_thrsh(sims, thrsh)
        f1 = meanf1(preds, targets)
        # Rank on (f1, thrsh) only. The prediction lists hold numpy arrays and
        # must never take part in the comparison: comparing them raises
        # "truth value of an array is ambiguous" on an exact tie.
        if best is None or (f1, thrsh) > (best[0], best[1]):
            # Keep just the current winner instead of every threshold's
            # predictions, so peak memory no longer grows with len(thrshes).
            best = (f1, thrsh, preds)
    if best is None:
        raise ValueError("thrshes must contain at least one threshold")
    return best

In [15]:
# Per-modality results keyed by a short tag ('v', 't', 'h'); each value is the
# (f1_best, thrsh_best, preds_best) tuple obtained from find_best.
modal2res = {}

## image sims

In [10]:
# Image-modality similarity matrix, loaded from disk.
# presumably rows/columns follow train.csv order, since indices are mapped through `pids` — TODO confirm
sims = np.load(f"{HOME}/data/dev0005/vsims.npy")

In [12]:
# Try thresholds from 0.70 to 1.00 in steps of 0.01 on the image similarities.
f1_best, thrsh_best, preds_best = find_best(sims, np.linspace(0.7,1.,num=31))

100%|██████████| 31/31 [01:37<00:00,  3.15s/it]


In [13]:
# Best mean F1 for the image modality and the threshold that achieved it.
f1_best, thrsh_best

(0.6661021248340279, 0.8099999999999999)

In [16]:
# Keep the image-modality result for the combination step.
modal2res['v'] = f1_best, thrsh_best, preds_best

## text sims

In [17]:
# Text-modality similarity matrix, loaded from disk (rebinds `sims`).
sims = np.load(f"{HOME}/data/dev0005/tsims.npy")

In [20]:
# Try thresholds from 0.90 to 1.00 in steps of 0.01 on the text similarities.
f1_best, thrsh_best, preds_best = find_best(sims, np.linspace(0.9,1.,num=11))

100%|██████████| 11/11 [00:40<00:00,  3.66s/it]


In [21]:
# Best mean F1 for the text modality and the threshold that achieved it.
f1_best, thrsh_best

(0.5732622373044297, 0.95)

In [22]:
# Keep the text-modality result for the combination step.
modal2res['t'] = f1_best, thrsh_best, preds_best

## hash sims

In [23]:
# Hash-modality similarity matrix, loaded from disk (rebinds `sims`).
sims = np.load(f"{HOME}/data/dev0011/hsims.npy")

In [24]:
# Try thresholds from 0.10 to 0.15 in steps of 0.01 on the hash similarities.
f1_best, thrsh_best, preds_best = find_best(sims, np.linspace(0.1, 0.15,num=6))

100%|██████████| 6/6 [00:18<00:00,  3.07s/it]


In [25]:
# Best mean F1 for the hash modality and the threshold that achieved it.
f1_best, thrsh_best

(0.5956242937941346, 0.11)

In [26]:
# Keep the hash-modality result for the combination step.
modal2res['h'] = f1_best, thrsh_best, preds_best

# combine

In [28]:
# Modalities whose results have been collected.
modal2res.keys()

dict_keys(['v', 't', 'h'])

In [64]:
# Union the per-modality predictions row by row: for each posting, concatenate
# the id arrays predicted by every modality and drop duplicates.
preds_per_modal = [res[-1] for res in modal2res.values()]
preds_cmb = [
    np.unique(np.concatenate(row_preds))
    for row_preds in zip(*preds_per_modal)
]

In [65]:
# Sanity check: number of combined rows and the first three merged id sets.
len(preds_cmb), preds_cmb[:3]

(34250,
 [array(['train_129225211', 'train_2278313361'], dtype=object),
  array(['train_1816968361', 'train_2120597446', 'train_3386243561',
         'train_3423213080', 'train_3805508898'], dtype=object),
  array(['train_2288590299', 'train_3803689425'], dtype=object)])

In [66]:
# Mean F1 of the combined (union) predictions against the ground truth.
meanf1(preds_cmb,targets)

0.6926547530973178