In [1]:
import spacy
import pandas as pd
import numpy as np
from tqdm import tqdm
from signal import signal, SIGPIPE, SIG_DFL
import re
import json

In [2]:
import os
import sys
sys.path.append('../')

from nlp.lemmatizer import LOOKUP
from nlp.noise_words import noise_words

In [3]:
from signal import signal, SIGPIPE, SIG_DFL

In [2]:
df = pd.read_json('../../data/tokped/susu/products.json')

In [5]:
def cleaning_title(doc):
    text = [token.text if token not in LOOKUP.items()
            else LOOKUP[token]
            for token in doc
            if not token.is_stop
            and token.text not in noise_words
            and len(token.text) > 2
            ]
    if len(text) > 2:
        return ' '.join(text)

In [9]:
nlp = spacy.blank('id')
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(title)).lower()
                      for title in df.name.values]

In [10]:
text = [cleaning_title(doc) for doc in tqdm(
        nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1))]

331634it [01:04, 5174.43it/s]


In [11]:
titles = [title.split() for title in text if title is not None]
words = np.array([word for title in tqdm(titles)
                 for word in title], dtype=object)

100%|██████████| 298570/298570 [00:00<00:00, 4125207.43it/s]


In [13]:
from collections import Counter
counter = Counter(words)

In [7]:
import json
with open('keyword_counter.json', 'w') as f:
    json.dump(dict(counter), f)

NameError: name 'counter' is not defined

In [14]:
keywords = [k[0] for k in counter.most_common(20)]
keywords

['susu',
 'milk',
 'keju',
 'cream',
 'isi',
 'coklat',
 'cheese',
 'butter',
 'uht',
 'bubuk',
 'sachet',
 'gold',
 'liter',
 'almond',
 'cheddar',
 'vanila',
 'repack',
 'full',
 'madu',
 'kambing']

In [20]:
titles = np.array(titles, dtype=object)

In [40]:
def keyword_pairs(keywords, n_common=20, threshold=100):
    for k in keywords:
        # find prods which contain the keyword and find second most common
        contains_k = titles[np.array([k in name for name in titles])]
        words = [word for name in contains_k for word in name]
        k_counter = Counter(words)
        most_common = k_counter.most_common(n_common)
        for key in most_common:
            if k != key[0] and key[1] > 100:
                yield k + ' ' + key[0]

In [41]:
f = open('../../data/tokped/susu/keywords.json', 'w')
json.dump(list(keyword_pairs(keywords)), f)

In [42]:
data = list(keyword_pairs(keywords))

In [46]:
len(json.dumps([{"value": k, "name": f"query: {k}"} for k in data]))

21702

In [3]:
df_keywords_loader = pd.read_json('../../data/tokped/susu/keyword_search.jsonlines', chunksize=10000, lines=True)
df_keywords = pd.concat([dataframe for dataframe in tqdm(df_keywords_loader, total=1358952/10000)])

  full_bar = Bar(frac,
100%|██████████| 136/135.8952 [00:13<00:00,  9.92it/s]


In [4]:
df_total = pd.concat([df_keywords.drop_duplicates(subset='id'), df]).drop_duplicates(subset='id')

In [5]:
df_total['sub_category'] = df_total.apply(lambda x: x.category_breadcrumb.split('/')[-1], axis=1)

In [6]:
df_total['sub_category'] = df_total.apply(lambda x: '-'.join([split for split in re.split(r" |\-|&", x.sub_category.lower()) if split != '']), axis=1)

In [7]:
df_total['shop_name'] = df_total.apply(lambda x: re.findall(r'(?<=https:\/\/www.tokopedia.com\/)(.*)(?=\/)', x.prod_url)[0], axis=1)

In [35]:
df_total

Unnamed: 0_level_0,id,name,category_breadcrumb,prod_url,old_price,discounted_price,discount_percent,stock,image_urls,review_count,rating,sold,ref,sub_category,shop_name
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
11440402,1874783778,ETAWAKU PLATINUM 3pcs - susu kambing etawa,makanan-minuman/produk-susu/susu-bubuk,https://www.tokopedia.com/etawakustore-1/etawa...,,210000,0,6,[https://images.tokopedia.net/img/cache/200-sq...,1,5.0,4,device=desktop&rows=200&source=universal&sc=26...,susu-bubuk,etawakustore-1
11544205,1871554960,Susu Kambing Etawa Murni - Etawaku Platinum,makanan-minuman/produk-susu/susu-bubuk,https://www.tokopedia.com/silostore93/susu-kam...,,73000,0,87,[https://images.tokopedia.net/img/cache/200-sq...,2,5.0,20,device=desktop&rows=200&source=universal&sc=26...,susu-bubuk,silostore93
11544205,1871551276,susu kambing etawa murni - Etawaku Platinum (p...,makanan-minuman/produk-susu/susu-bubuk,https://www.tokopedia.com/silostore93/susu-kam...,,140000,0,49,[https://images.tokopedia.net/img/cache/200-sq...,1,5.0,1,device=desktop&rows=200&source=universal&sc=26...,susu-bubuk,silostore93
10778193,1870129638,Susu Kambing Etawa Gomars,makanan-minuman/produk-susu/susu-bubuk,https://www.tokopedia.com/rumahherbalamanah/su...,,17000,0,60,[https://images.tokopedia.net/img/cache/200-sq...,2,5.0,40,device=desktop&rows=200&source=universal&sc=26...,susu-bubuk,rumahherbalamanah
11547950,1864859469,susu kambing etawa HMS coletrum (1 bungkus),makanan-minuman/produk-susu/susu-bubuk,https://www.tokopedia.com/fujjolshop-1/susu-ka...,,10000,0,417,[https://images.tokopedia.net/img/cache/200-sq...,2,5.0,6,device=desktop&rows=200&source=universal&sc=26...,susu-bubuk,fujjolshop-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6391989,617562924,Susu Dancow Putih,Makanan & Minuman/Produk Susu/Susu Bubuk,https://www.tokopedia.com/sembako-mandiri/susu...,0.0,31423,0,9990,[https://ecs7-p.tokopedia.net/img/cache/250-sq...,3,5.0,10,"{'from': '1604474023', 'section': 'Pilihan lai...",susu-bubuk,sembako-mandiri
3473756,566442284,KUAS MENTEGA KUAS ROTI SILIKON CAKE BARBEQUE 1...,Makanan & Minuman/Produk Susu/Mentega & Butter,https://www.tokopedia.com/wjcol/kuas-mentega-k...,0.0,2600,0,999995,[https://ecs7-p.tokopedia.net/img/cache/250-sq...,1,5.0,4,"{'from': '281335192', 'section': 'Produk spons...",mentega-butter,wjcol
179367,5224388,Amino X - 70serving,Makanan & Minuman/Produk Susu/Susu Bubuk,https://www.tokopedia.com/suplemen-fitness/ami...,0.0,1120000,0,999999,[https://ecs7-p.tokopedia.net/img/cache/250-sq...,1,3.0,0,"{'from': '5224406', 'section': 'Produk sponsor...",susu-bubuk,suplemen-fitness
4863938,373378032,alat pembusa susu,Makanan & Minuman/Produk Susu/Susu Bubuk,https://www.tokopedia.com/lapakempatsatu/alat-...,0.0,50000,0,999999,[https://ecs7-p.tokopedia.net/img/cache/250-sq...,6,5.0,16,"{'from': '1022486913', 'section': 'Produk spon...",susu-bubuk,lapakempatsatu


In [17]:
df_total = df_total.rename(columns={'shop': 'shop_id'})

In [10]:
# find non produk-susu products
df_total[df_total.apply(lambda x: len(x.category_breadcrumb.split('/')) != 3 or (x.category_breadcrumb.split('/')[-2] != 'Produk Susu' and x.category_breadcrumb.split('/')[-2] != 'produk-susu'), axis=1)]


Unnamed: 0,id,name,category_breadcrumb,prod_url,old_price,discounted_price,discount_percent,stock,shop,image_urls,review_count,rating,sold,ref,sub_category,shop_name


In [13]:
df_total = df_total.reset_index(drop=True)

In [19]:
df_total.to_json('../../data/tokped/susu/products_final.json')

In [63]:
# find sellers
unique_shop_id = df_total.index.unique()

In [29]:
df_total = df_total.set_index('shop_id')

In [57]:
df_total.loc[unique_shop_id[0]].iloc[0].shop_name

'etawakustore-1'

In [70]:
unique_shop_name = [df_total.loc[id].iloc[0].shop_name for id in tqdm(unique_shop_id)]




  0%|          | 0/16974 [00:00<?, ?it/s][A[A[A


  1%|          | 210/16974 [00:00<00:08, 2094.31it/s][A[A[A


  3%|▎         | 477/16974 [00:00<00:06, 2424.64it/s][A[A[A


  4%|▍         | 754/16974 [00:00<00:06, 2579.13it/s][A[A[A


  6%|▋         | 1083/16974 [00:00<00:05, 2857.11it/s][A[A[A


  8%|▊         | 1417/16974 [00:00<00:05, 3029.56it/s][A[A[A


 10%|█         | 1760/16974 [00:00<00:04, 3165.13it/s][A[A[A


 12%|█▏        | 2116/16974 [00:00<00:04, 3293.33it/s][A[A[A


 15%|█▍        | 2493/16974 [00:00<00:04, 3443.97it/s][A[A[A


 17%|█▋        | 2869/16974 [00:00<00:03, 3542.53it/s][A[A[A


 19%|█▉        | 3224/16974 [00:01<00:03, 3530.98it/s][A[A[A


 21%|██        | 3578/16974 [00:01<00:03, 3425.83it/s][A[A[A


 23%|██▎       | 3958/16974 [00:01<00:03, 3534.04it/s][A[A[A


 26%|██▌       | 4359/16974 [00:01<00:03, 3673.55it/s][A[A[A


 28%|██▊       | 4751/16974 [00:01<00:03, 3746.23it/s][A[A[A


 30%|███       | 5127/1

In [86]:
shop_df = df_total.copy()
shop_df

In [87]:
shop_df.columns

Index(['id', 'name', 'category_breadcrumb', 'prod_url', 'old_price',
       'discounted_price', 'discount_percent', 'stock', 'image_urls',
       'review_count', 'rating', 'sold', 'ref', 'sub_category', 'shop_name'],
      dtype='object')

In [89]:
shop_df = shop_df.drop(columns=['id', 'name', 'category_breadcrumb', 'prod_url', 'old_price',
       'discounted_price', 'discount_percent', 'stock', 'image_urls',
       'review_count', 'rating', 'sold', 'ref', 'sub_category'])

In [91]:
shop_df = shop_df.drop_duplicates()

In [92]:
shop_df.to_json("../../data/tokped/susu/sellers.json")

In [94]:
df_total.loc[11440402]

Unnamed: 0_level_0,id,name,category_breadcrumb,prod_url,old_price,discounted_price,discount_percent,stock,image_urls,review_count,rating,sold,ref,sub_category,shop_name
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
11440402,1874783778,ETAWAKU PLATINUM 3pcs - susu kambing etawa,makanan-minuman/produk-susu/susu-bubuk,https://www.tokopedia.com/etawakustore-1/etawa...,,210000,0,6,[https://images.tokopedia.net/img/cache/200-sq...,1,5.0,4,device=desktop&rows=200&source=universal&sc=26...,susu-bubuk,etawakustore-1
11440402,1827694242,etawaku platinum 2box - susu kambing murni ber...,makanan-minuman/produk-susu/susu-bubuk,https://www.tokopedia.com/etawakustore-1/etawa...,,140000,0,44,[https://images.tokopedia.net/img/cache/200-sq...,4,5.0,22,device=desktop&rows=200&source=universal&sc=26...,susu-bubuk,etawakustore-1
11440402,1827692240,etawaku platinum - susu kambing murni bergizi,makanan-minuman/produk-susu/susu-bubuk,https://www.tokopedia.com/etawakustore-1/etawa...,,75000,0,95,[https://images.tokopedia.net/img/cache/200-sq...,1,5.0,5,device=desktop&rows=200&source=universal&sc=26...,susu-bubuk,etawakustore-1
11440402,1873500436,ETAWAKU PLATINUM 5 box ready stock,Makanan & Minuman/Produk Susu/Susu Bubuk,https://www.tokopedia.com/etawakustore-1/etawa...,0.0,350000,0,9,[https://ecs7-p.tokopedia.net/img/cache/250-sq...,0,0.0,1,"{'from': '1874783778', 'section': 'Lainnya di ...",susu-bubuk,etawakustore-1
11440402,1851165091,Susu daymilk - susu kambing etawa - Vanila,Makanan & Minuman/Produk Susu/Susu Bubuk,https://www.tokopedia.com/etawakustore-1/susu-...,0.0,25000,0,1,[https://ecs7-p.tokopedia.net/img/cache/250-sq...,0,0.0,0,"{'from': '1874783778', 'section': 'Lainnya di ...",susu-bubuk,etawakustore-1
11440402,1874802065,ETAWAKU PLATIKUM 10box - susu kambing etawa murni,Makanan & Minuman/Produk Susu/Susu segar & Pas...,https://www.tokopedia.com/etawakustore-1/etawa...,0.0,660000,0,10,[https://ecs7-p.tokopedia.net/img/cache/250-sq...,0,0.0,0,"{'from': '1874783778', 'section': 'Lainnya di ...",susu-segar-pasteurisasi,etawakustore-1


In [93]:
shop_df

Unnamed: 0_level_0,shop_name
shop_id,Unnamed: 1_level_1
11440402,etawakustore-1
11544205,silostore93
10778193,rumahherbalamanah
11547950,fujjolshop-1
11549673,mamafaizstore
...,...
4179132,ryotakise
8859878,tokoadekaka-1
3769682,fadlinabawi910
3385766,decko-hulu


In [43]:
%time df_total.loc[11440402].iloc[0]

CPU times: user 1.6 ms, sys: 851 µs, total: 2.45 ms
Wall time: 1.54 ms


id                                                            1874783778
name                          ETAWAKU PLATINUM 3pcs - susu kambing etawa
category_breadcrumb               makanan-minuman/produk-susu/susu-bubuk
prod_url               https://www.tokopedia.com/etawakustore-1/etawa...
old_price                                                            NaN
discounted_price                                                  210000
discount_percent                                                       0
stock                                                                  6
image_urls             [https://images.tokopedia.net/img/cache/200-sq...
review_count                                                           1
rating                                                               5.0
sold                                                                   4
ref                    device=desktop&rows=200&source=universal&sc=26...
sub_category                                       