In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import gc
import torch
from PIL import Image
from IPython.display import Image,display
# Seed torch's RNG. The cuDNN flags below favour speed over strict run-to-run determinism.
torch.manual_seed(0)
# Allow cuDNN to use non-deterministic algorithms.
torch.backends.cudnn.deterministic = False
# Let cuDNN benchmark candidate algorithms and pick the fastest one for the input shapes it sees.
torch.backends.cudnn.benchmark = True

In [2]:
# Load the training metadata; the bare `train.shape` expression is shown as the cell output.
train=pd.read_csv(r'../input/shopee-product-matching/train.csv')
train.shape


(34250, 5)

In [3]:
# Dataset locations, relative to the notebook's working directory.
train_image='../input/shopee-product-matching/train_images'  # directory that holds the training images
WORK_DIR = '../input/shopee-product-matching'  # dataset root (train.csv is read from here)

In [4]:
import re
def preprocess(description):
    """Normalise a product title into lowercase, space-separated alphanumeric tokens.

    Steps, in order: lowercase the text; turn runs of hyphens, newlines and
    tabs into a single space; expand common English contractions; replace
    every run of characters outside ``[a-z0-9]`` (this includes accented
    letters and punctuation) with a single space; collapse whitespace and
    strip both ends.

    Parameters
    ----------
    description : str
        Raw title text.

    Returns
    -------
    str
        The cleaned title; may be the empty string.
    """
    # Contraction expansion is mostly a no-op here: product titles rarely
    # contain conversational English. It is kept for completeness.
    # Order matters: the specific forms ("won't", "can't") must run before the
    # generic "n't", and "n't" before the bare "'t".
    contractions = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )

    description = description.lower()
    description = re.sub(r'[-\n\t]+', ' ', description)
    for pattern, replacement in contractions:
        description = re.sub(pattern, replacement, description)
    description = re.sub(r'[^a-z0-9]+', ' ', description)
    # Raw string: a plain '\s' is an invalid escape sequence and triggers a
    # warning on recent Python versions.
    description = re.sub(r'\s+', ' ', description)
    return description.strip()

In [5]:
# Normalise every title with preprocess(); tqdm only adds the progress bar.
clensed_train=[preprocess(title) for title in tqdm(train.title.values)]

100%|██████████| 34250/34250 [00:00<00:00, 34921.96it/s]


In [6]:
%%time
# Fit a TF-IDF vocabulary on the cleaned titles and materialise the result as a dense array
# (one row per title, one column per vocabulary term).
# NOTE(review): .toarray() densifies the sparse matrix, which is memory-hungry for a large vocabulary.
tfidf=TfidfVectorizer()
embedded_train=tfidf.fit_transform(clensed_train).toarray()
embedded_train.shape

CPU times: user 804 ms, sys: 924 ms, total: 1.73 s
Wall time: 1.75 s


(34250, 25023)

In [7]:
# Wrap the dense TF-IDF array in a tensor and move it to the GPU for the similarity matmul.
# NOTE(review): .cuda() needs a CUDA device, so this cell fails on a CPU-only kernel. It is also
# not re-runnable: on a second run `embedded_train` is already a tensor and torch.from_numpy rejects it.
embedded_train=torch.from_numpy(embedded_train)
embedded_train=embedded_train.cuda()

In [8]:
%%time
# Build, for every posting, a space-separated string of the posting_ids whose
# TF-IDF title vector scores at least SIM_THRESHOLD against it. The score is a
# dot product of TF-IDF rows, which equals cosine similarity when the rows are
# L2-normalised (TfidfVectorizer's default). Scores are computed chunk by chunk
# so that only a (chunksize x n_postings) score matrix exists at any time.
SIM_THRESHOLD = 0.65  # minimum score for two postings to count as a match
MAX_MATCHES = 50      # cap on the number of matches kept per posting

matches = []
chunksize = 500
chunks = -(-len(train) // chunksize)  # ceiling division
posting_ids = train.posting_id.values  # hoisted: positional lookup array
for chunk in tqdm(range(chunks)):
    start = chunk * chunksize
    end = min(len(train), start + chunksize)
    # (n_postings, dim) @ (dim, chunk) -> (n_postings, chunk); transpose so
    # that each row holds one chunk posting's scores against all postings.
    cossim = torch.matmul(embedded_train, embedded_train[start:end].T).T
    cossim = cossim.data.cpu().numpy()
    for per_posting in cossim:
        indices = np.where(per_posting >= SIM_THRESHOLD)[0]
        if len(indices) > MAX_MATCHES:
            # Keep everything scoring at least as high as the MAX_MATCHES-th
            # best score. np.partition yields that k-th value without sorting
            # the full row. Ties at the cut-off may keep more than MAX_MATCHES.
            kth_score = np.partition(per_posting, -MAX_MATCHES)[-MAX_MATCHES]
            indices = np.where(per_posting >= kth_score)[0]
        matches.append(' '.join(posting_ids[indices].tolist()))

100%|██████████| 69/69 [00:47<00:00,  1.44it/s]

CPU times: user 42.5 s, sys: 4.72 s, total: 47.2 s
Wall time: 47.8 s





In [9]:
# TODO: create a function that takes a title as input and
#   - returns the names of the matching postings
#   - displays the posting's own image together with the images of its matches

In [10]:
from IPython.display import Image,display
def txt_return_img (df):
    """Prompt for an exact item title, show its image, and return the matching rows.

    The title typed at the prompt is compared for exact equality against
    ``df['title']``. The image of the first matching row is displayed via
    ``image_dis``; all matching rows are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the ``title`` and ``image`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose title equals the entered text. Empty when
        nothing matches; in that case no image is displayed.
    """
    i = input("enter item name:")
    hits = df.loc[df['title'] == i]
    if hits.empty:
        # An unknown title used to surface as a bare StopIteration from
        # next(iter(...)); report it and hand back the empty selection.
        print(f"no posting found with title {i!r}")
        return hits
    image_dis(df['image'], hits.index[0])
    return hits

def image_dis(dfs, o, image_dir='../input/shopee-product-matching/train_images'):
    """Display one training image inline at 480x360.

    Parameters
    ----------
    dfs : mapping of index label -> image file name
        Typically the ``image`` column of the training frame.
    o : index label
        Key of the image to show; looked up as ``dfs[o]``.
    image_dir : str, optional
        Directory containing the image files. Defaults to the path that was
        previously hard-coded here.

    Returns
    -------
    Whatever ``IPython.display.display`` returns.
    """
    # A trailing slash on image_dir is tolerated so that the joined path never
    # contains a doubled separator.
    path = image_dir.rstrip('/') + '/' + dfs[o]
    return display(Image(filename=path, width=480, height=360))
#df.loc[df['title']].str.contains(i,regex=True,case=False)]

In [11]:
# Attach the match strings as a new column (this adds the column to `train` in place)
# and preview the first ten rows.
train['matches']=matches
train.head(10)

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,train_129225211 train_2278313361
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,train_3386243561
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,train_2288590299 train_3803689425
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,train_2406599165 train_3576714541 train_150810...
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,train_3369186413
5,train_2464356923,0013e7355ffc5ff8fb1ccad3e42d92fe.jpg,bbd097a7870f4a50,CELANA WANITA (BB 45-84 KG)Harem wanita (bisa...,2660605217,train_2464356923
6,train_1802986387,00144a49c56599d45354a1c28104c039.jpg,f815c9bb833ab4c8,Jubah anak size 1-12 thn,1835033137,train_1802986387 train_1396161074
7,train_1806152124,0014f61389cbaa687a58e38a97b6383d.jpg,eea7e1c0c04da33d,KULOT PLISKET SALUR /CANDY PLISKET /WISH KULOT...,1565741687,train_1806152124
8,train_86570404,0019a3c6755a194cb2e2c12bfc63972e.jpg,ea9af4f483249972,"[LOGU] Tempelan kulkas magnet angka, tempelan ...",2359912463,train_86570404 train_2269068443 train_115157077
9,train_831680791,001be52b2beec40ddc1d2d7fc7a68f08.jpg,e1ce953d1a70618f,BIG SALE SEPATU PANTOFEL KULIT KEREN KERJA KAN...,2630990665,train_831680791


In [12]:
# Keep only posting_id and its space-separated match string; the bare name displays the frame.
submission=train[['posting_id','matches']]
submission

Unnamed: 0,posting_id,matches
0,train_129225211,train_129225211 train_2278313361
1,train_3386243561,train_3386243561
2,train_2288590299,train_2288590299 train_3803689425
3,train_2406599165,train_2406599165 train_3576714541 train_150810...
4,train_3369186413,train_3369186413
...,...,...
34245,train_4028265689,train_4028265689
34246,train_769054909,train_1463059254 train_2530102819 train_769054909
34247,train_614977732,train_512157627 train_614977732
34248,train_3630949769,train_3419392575 train_1431563868 train_363094...


In [13]:
# Drop the names bound to the large tensor and the DataFrame, then run the garbage collector.
del embedded_train,train
gc.collect()

66

In [14]:
# NOTE(review): this rebinds `submission`, discarding the frame that was built from `train` earlier.
# What gets written out afterwards is the sample submission file, not the computed matches.
# Confirm that this is intended.
submission=pd.read_csv(r'../input/shopee-product-matching/sample_submission.csv')

In [15]:
# Write the current `submission` frame to submission.csv, omitting the index column.
submission.to_csv('submission.csv',index=False)