In [1]:
import os
import pandas as pd

In [2]:
from rapidfuzz import process as proc
from rapidfuzz import fuzz

In [3]:
from tqdm import tqdm

In [4]:
from fastparquet import ParquetFile, write as parq_write

In [5]:
from src.utils import simple_process_item, count_common_digits, count_common_words, count_digit_share, remove_numbers, get_product_group, get_parent_group

[nltk_data] Downloading package punkt to /home/varsey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/varsey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_ru to
[nltk_data]     /home/varsey/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_ru is already up-to-
[nltk_data]       date!


In [6]:
# Loading data
df_incoming = pd.read_excel(f'{os.getcwd()}/data/drl-data.xlsx', sheet_name='Inventory Out')
df_outcoming = pd.read_excel(f'{os.getcwd()}/data/drl-data.xlsx', sheet_name='Inventory In')

In [7]:
df_outcoming['unit'] = df_incoming['Unit'].str.lower()
df_incoming['unit'] = df_incoming['Unit'].str.lower()

In [8]:
df_outcoming['items'] = df_outcoming['Product or Service Name'].apply(lambda x: simple_process_item(x))
df_incoming['items'] = df_incoming['Product or Service Name'].apply(lambda x: simple_process_item(x))

In [13]:
%%time

to_find = df_incoming['items'].to_list()[:300_000]
candidates = set(df_outcoming['items'].to_list())

outcoming, incoming1, incoming2, score1, score2, score_w, score_d = [], [], [], [], [], [], []
for item in tqdm(to_find[:]):
    res1 = proc.extract(
                item,
                candidates,
                scorer=fuzz.partial_token_sort_ratio,
                limit=1
    )[0]
    res2 = proc.extract(
                item,
                candidates,
                scorer=fuzz.partial_ratio,
                limit=1
    )[0]
    outcoming.append(item)
    incoming1.append(res1[0])
    incoming2.append(res2[0])
    score1.append(res1[1])
    score2.append(res2[1])
    if len(res1) > 0:
        score_w.append(count_common_words(item, res1[0]))
        score_d.append(count_common_digits(item, res1[0]) / count_digit_share(res1[0]))


100%|██████████| 300000/300000 [1:09:25<00:00, 72.02it/s] 

CPU times: user 1h 9min 18s, sys: 8.5 s, total: 1h 9min 27s
Wall time: 1h 9min 25s





In [9]:
comp = pd.DataFrame()
comp['right'] = outcoming
comp['left1'] = incoming1
comp['left2'] = incoming2

comp['score1'] = score1
comp['score2'] = score2
comp['score3'] = 2 * comp['score1'] *comp['score2'] /(comp['score1'] + comp['score2'] )

comp['score_w'] = score_w
comp['score_d'] = score_d
comp['score_d'] = comp['score_d'] /10

NameError: name 'outcoming' is not defined

In [None]:
parq_write(
    f'{os.getcwd()}/comp_300_000.parq',
    comp,
    compression='GZIP'
)

In [10]:
comp_parq = ParquetFile(f'{os.getcwd()}/comp_300_000.parq',)
comp = comp_parq.to_pandas()

In [11]:
sub_result = pd.concat(
    [
        df_incoming[:300_000],
        comp
    ],
    axis=1,
)
sub_result.shape

(300000, 14)

In [12]:
sub_result.sample(20)

Unnamed: 0,#,Product or Service Name,Unit,Quantity/Volume,unit,items,right,left1,left2,score1,score2,score3,score_w,score_d
9360,9361,PRESIDENT YAGLI 50 GR 1X60,EDED,10,eded,president yagli 50 gr 1 x 60,president yagli 50 gr 1 x 60,president 140 qr yaglı kruq,pendir president labne 400 grx 6,77.55102,71.698113,74.509804,0.4,0.0
111657,111658,QUICKBURY BUNS MEGABURGER SEASAME 4 PCS 300GR 1X7,EDED,21,eded,quickbury buns megaburger seasame 4 pcs 300 gr...,quickbury buns megaburger seasame 4 pcs 300 gr...,tee 19 3 4,sanagel plus 30 gr gel,66.666667,63.636364,65.116279,0.0,0.05
248382,248383,ZOLOTOY PETUSOK NAGGETS KLASSIK 300 GR 1X14(2728),EDED,95,eded,zolotoy petusok naggets klassik 300 gr 1 x 14 ...,zolotoy petusok naggets klassik 300 gr 1 x 14 ...,k 45 19,k 45 22,66.666667,71.428571,68.965517,0.0,0.0
31395,31396,SEVIMLI DAD XAMA KAUNAS 10% 200 GR 1X12,EDED,12,eded,sevimli dad xama kaunas 10 pct 200 gr 1 x 12,sevimli dad xama kaunas 10 pct 200 gr 1 x 12,sevimli dad xama kaunas 10 pct 200 gr 1 x 12,sevimli dad xama kaunas 10 pct 200 gr 1 x 12,100.0,100.0,100.0,1.0,0.425
132382,132383,PINAR PEYNIR TAZE KASAR 200 GR 1x12,EDED,4,eded,pinar peynir taze kasar 200 gr 1 x 12,pinar peynir taze kasar 200 gr 1 x 12,pinar gence qaz,pinar gence iyne,73.333333,69.230769,71.223022,0.333333,0.0
108345,108346,DORITOS TACO 130 GR 1 X 16,EDED,17,eded,doritos taco 130 gr 1 x 16,doritos taco 130 gr 1 x 16,doritos bbq 130 g x 16,doritos acılı 130 qr x 16,80.952381,80.0,80.473373,0.666667,0.272
262032,262033,"PINAR FRII KRMZ UZUM 0,25L 1x24 (4x6'li)",ADET.LT,12,adet.lt,pinar frii krmz uzum 0.25 l 1 x 24 4 x 6 li,pinar frii krmz uzum 0.25 l 1 x 24 4 x 6 li,7 up 0.5 lt x 12,7 up 0.5 lt x 12,60.0,68.75,64.07767,0.0,0.0
280984,280985,PRESIDENT KESME CHIZBURGER 150 GR 1X15,EDED,3,eded,president kesme chizburger 150 gr 1 x 15,president kesme chizburger 150 gr 1 x 15,president 150 qr chizburqer kəsmə 40 pct 1 x 15,premium,75.0,66.666667,70.588235,0.333333,0.407143
145783,145784,SUXARI MIX DENIZ 95 GR 1 X 16,EDED,16,eded,suxari mix deniz 95 gr 1 x 16,suxari mix deniz 95 gr 1 x 16,suxari xrustim mix sırnıy 95 q dsp x 16,suxari xrustim mix sırnıy 95 q dsp x 16,73.076923,62.068966,67.124632,0.4,0.62
114455,114456,ULKER PENDIR LABNE 200 GR 1X10,EDED,10,eded,ulker pendir labne 200 gr 1 x 10,ulker pendir labne 200 gr 1 x 10,pendir president labne 200 grx 10,pendir i̇çi̇m labne 200 grx 10,82.758621,82.142857,82.449589,0.666667,0.448


In [13]:
compilation_full = sub_result.merge(df_outcoming, how='left', left_on='left1', right_on='items')
compilation_full = compilation_full.drop_duplicates(subset=['#_x']) 
compilation_full.shape

(300000, 20)

In [14]:
columns_to_use = [
          'items_x', 'items_y',
          'left1', 'left2',
          'Quantity/Volume_x', 'unit_x',
          'Quantity/Volume_y', 'unit_y',
          'score1', 'score2', 'score3', 'score_w', 'score_d',
          'Product or Service Name_x', 'Product or Service Name_y'
]

In [15]:
compilation_full = compilation_full[columns_to_use]

In [16]:
df_outcoming.shape, df_incoming.shape, compilation_full.shape

((16947, 6), (937415, 6), (300000, 15))

In [17]:
comp_true = compilation_full[
    (compilation_full.score3 > 78) 
    & 
    (compilation_full.score_d > 0.17)
]

In [18]:
comp_doubt = compilation_full.drop(comp_true.index, axis=0)


In [19]:
# comp_doubt_x = comp_doubt.copy()
# comp_doubt_y = comp_doubt.copy()
# 
# comp_doubt_x['items_y']  = ''
# comp_doubt_x['Quantity/Volume_y'] = 0
# comp_doubt_x['unit_y'] = ''
# # comp_doubt['Product or Service Name_y'] = ''
# 
# comp_doubt_y['items_x']  = ''
# comp_doubt_y['Quantity/Volume_x'] = 0
# comp_doubt_y['unit_x'] = ''
# # comp_doubt['Product or Service Name_y'] = ''

# comp_doubt_x.shape, comp_doubt_y.shape

In [20]:
compilation_full.shape, comp_true.shape

((300000, 15), (85196, 15))

In [28]:
comp_true.shape[0] / compilation_full.shape[0]

0.28398666666666667

In [23]:
compilation_scored = comp_true

In [24]:
compilation_scored['unit_common'] = compilation_scored['unit_x'] + ' ' +  compilation_scored['unit_y']
compilation_scored['unit_common_set'] = compilation_scored['unit_common'].apply(lambda x: set(str(x).replace('.', ' ').split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['unit_common'] = compilation_scored['unit_x'] + ' ' +  compilation_scored['unit_y']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['unit_common_set'] = compilation_scored['unit_common'].apply(lambda x: set(str(x).replace('.', ' ').split()))


In [25]:
compilation_scored['unit_common_str'] = compilation_scored['unit_common_set'].apply(
    lambda x: str(list(x)[0]) if len(list(x)) == 0 else ' '.join(list(x))
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['unit_common_str'] = compilation_scored['unit_common_set'].apply(


In [26]:
compilation_scored['quant_in'] = compilation_scored['Quantity/Volume_x'].astype('str').str.replace(',', '.').fillna(0).astype('float')
compilation_scored['quant_out'] = compilation_scored['Quantity/Volume_y'].astype('str').str.replace(',', '.').fillna(0).astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['quant_in'] = compilation_scored['Quantity/Volume_x'].astype('str').str.replace(',', '.').fillna(0).astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['quant_out'] = compilation_scored['Quantity/Volume_y'].astype('str').str.replace(',', '.').fillna(0).astype('float')


In [29]:
compilation_scored.shape

(85196, 20)

In [30]:
# x - incoming y - outcoming
compilation_scored[
    [
        'items_x', 'items_y',
        'score3', 'score1', 'score2', 'score_w', 'score_d',
        'Product or Service Name_y'
    ]
].sample(250)

Unnamed: 0,items_x,items_y,score3,score1,score2,score_w,score_d,Product or Service Name_y
2389449,sevimli dad kesmik yagsiz 0 pct 180 gr 1 x 1,sevimli dad kesmik yagsiz 0.3 pct 180 gr 1 x 1,97.122882,98.850575,95.454545,1.000000,0.411429,SEVIMLI DAD KESMIK YAGSIZ 0.3% 180 GR 1x1
2791818,milla yoqurt ciyelek 3.5 pct 115 gr 1 x 48,milla yoqurt gilas 3.5 pct 115 qr x 48,82.191781,78.947368,85.714286,0.666667,0.342857,Milla Yoqurt Gilas 3.5% 115qr *48
2364019,icimino sud bananli 180 ml 1 x 27,süd i̇çi̇mino bananlı 180 ml x 27,84.316186,87.500000,81.355932,0.000000,0.432000,"Süd ""İÇİMINO"" Bananlı 180 ML x 27"
1649170,sevimli dad kesmik kaunas 0.3 pct 180 gr 1 x 1,sevimli dad kesmik kaunas 0 pct 180 gr 1 x 1,97.122882,98.850575,95.454545,1.000000,0.453333,SEVIMLI DAD KESMIK KAUNAS 0% 180 GR1x1
4086086,sevimli dad kesmik kaunas 0.3 pct 180 gr 1 x 1,sevimli dad kesmik kaunas 0 pct 180 gr 1 x 1,97.122882,98.850575,95.454545,1.000000,0.453333,SEVIMLI DAD KESMIK KAUNAS 0% 180 GR1x1
...,...,...,...,...,...,...,...,...
1184240,sochnaya dolina alma 0.95 lt 1 x 12,mirinda alma 0.5 lt x 12,79.115765,73.913043,85.106383,0.333333,0.316667,Mirinda Alma 0.5 lt*12
4536431,lays rebrishki qril 150 gr 1 x 18,lays rebrışki qril 150 q dspx 18,78.045470,77.966102,78.125000,0.571429,0.416000,Lay's Rebrışki Qril 150q DSPX 18
4048471,doritos acili 74 gr 1 x 24,doritos acılı 74 qr x 24,83.125000,87.500000,79.166667,0.500000,0.380000,Doritos Acılı 74qr X24
1748739,suxari smetan 130 gr 1 x 24,suxari xrustim smetan 130 qr x 24,85.714286,92.307692,80.000000,0.800000,0.432000,Suxari Xrustim Smetan 130qr X24


In [31]:
compilation_scored['items_x_numberless'] = compilation_scored['items_x'].apply(lambda x: remove_numbers(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['items_x_numberless'] = compilation_scored['items_x'].apply(lambda x: remove_numbers(x))


In [32]:
compilation_scored['product_group'] = compilation_scored['items_x_numberless'].apply(lambda x: get_product_group(x))
compilation_scored['parent_group'] = compilation_scored['product_group'].apply(lambda x: get_parent_group(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['product_group'] = compilation_scored['items_x_numberless'].apply(lambda x: get_product_group(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['parent_group'] = compilation_scored['product_group'].apply(lambda x: get_parent_group(x))


In [33]:
compilation_scored.groupby(
    by=['parent_group', 'product_group', 'items_x', 'unit_common_str']
).agg(
    {'quant_in':'sum','quant_out':'sum'}
).sort_values(
    ['unit_common_str']
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,quant_in,quant_out
parent_group,product_group,items_x,unit_common_str,Unnamed: 4_level_1,Unnamed: 5_level_1
xama,president xama,president xama 30 pct.200 gr 1 x 8,cl eded adet,31.0,2920.0
xama,president xama,president xama 10 pct 350 gr 1 x 8,cl eded adet,35.0,2920.0
sud,sud icim,sud icim yarim yagli 1 lt 1 x 12,cl eded adet,346.0,5418.0
xama,president xama,president xama 15 pct 350 gr 1 x 8,cl eded adet,2140.0,144248.0
kefir,president kefir,president kefir 2.5 pct 450 gr 1 x 8,cl eded adet,1278.0,103368.0
...,...,...,...,...,...
qaymaq,president qaymaq,president qaymaq 200 gr 1 x 27,ədəd lt adet,6718.0,491400.0
kesmik,prezident kesmik,prezident kesmik 9 pct 200 gr,ədəd lt adet,282.0,3780.0
president,president tradisiya,president tradisiya kesmik 9 pct 250 gr,ədəd lt adet,128.0,1980.0
pro,pro canax,m.pro canax pendiri 500 gr,ədəd lt adet,4953.0,14.0


In [34]:
general_aggregate = compilation_scored.groupby(
    by=['parent_group', 'product_group', 'items_x', 'unit_common_str']
).agg(
    {'quant_in':'sum','quant_out':'sum'}
).sort_values(
    ['unit_common_str']
)

In [38]:
brand_agg = compilation_scored.groupby(['parent_group', 'product_group', 'items_x', 'unit_common_str']).agg({'quant_in': 'sum', 'quant_out': 'sum'}).reset_index()
brand_agg.rename(columns={'quant_in': 'brand_inventory_in', 'quant_out': 'brand_inventory_out'}, inplace=True)

result = pd.merge(brand_agg, general_aggregate, on='parent_group', how='left')

In [40]:
result.groupby(
    by=['parent_group', 'product_group', 'items_x', 'unit_common_str']
).agg(
    {'quant_in':'sum','quant_out':'sum', 'brand_inventory_in': 'max', 'brand_inventory_out': 'max'}
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,quant_in,quant_out,brand_inventory_in,brand_inventory_out
parent_group,product_group,items_x,unit_common_str,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,,7 up 0.5 lt 1 x 12,eded lt adet,1182.00,36360.0,146.0,10800.0
,,7 up 1 lt 1 x 12,eded lt adet,1182.00,36360.0,350.0,19800.0
,,7 up 2 lt 1 x 6,eded lt adet,1182.00,36360.0,686.0,5760.0
acili,doritos acili,doritos acili 130 gr 1 x 16,eded,57756.50,68674704.0,31578.5,30888000.0
acili,doritos acili,doritos acili 74 gr 1 x 24,eded,57756.50,68674704.0,26178.0,37786704.0
...,...,...,...,...,...,...,...
xama,sevimli xama,sevimli dad xama kaunas 10 pct 200 gr 1 x 12,eded,616299.02,60626273.0,97068.0,18825646.0
xama,sevimli xama,sevimli dad xama kaunasskaya 20 pct 200 gr 1 x 12,eded,616299.02,60626273.0,272818.0,13467693.0
xama,sevimli xama,sevimli dad xama kaunasskaya 20 pct 350 gr 1 x 12,eded,616299.02,60626273.0,115600.0,11232270.0
xama,sevimli xama,sevimli dad xama kaunasskaya 30 pct 200 gr 1 x 12,eded,616299.02,60626273.0,52494.0,4596900.0
