In [1]:
import os
import pandas as pd

In [2]:
from rapidfuzz import process as proc
from rapidfuzz import fuzz

In [3]:
from tqdm import tqdm

In [4]:
from fastparquet import ParquetFile, write as parq_write

In [5]:
# Load methods for text processing
from src.utils import (
    simple_process_item, count_common_digits, count_common_words, count_digit_share,
    remove_numbers, get_product_group, get_parent_group, get_excluded_list
)

[nltk_data] Downloading package punkt to /home/varsey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/varsey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_ru to
[nltk_data]     /home/varsey/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_ru is already up-to-
[nltk_data]       date!


## Load dataset and preprocess items names (lowercase and stopwords removal)

In [11]:
# Download data by url in 'data' folder
df_incoming = pd.read_csv(f'{os.getcwd()}/data/incoming.csv')
df_outcoming = pd.read_csv(f'{os.getcwd()}/data/outcoming.csv')

In [12]:
df_outcoming['unit'] = df_incoming['Unit'].str.lower()
df_incoming['unit'] = df_incoming['Unit'].str.lower()

In [13]:
exclude = get_excluded_list()

In [14]:
df_outcoming['items'] = df_outcoming['Product or Service Name'].apply(lambda x: simple_process_item(x, exclude))
df_incoming['items'] = df_incoming['Product or Service Name'].apply(lambda x: simple_process_item(x, exclude))

## Create lists of pairs for each item in df_incoming df with scores for each pair

### The cell below take up to 4hs to run. You can use preprocessed data below

In [15]:
%%time

to_find = df_incoming['items'].to_list()
candidates = set(df_outcoming['items'].to_list())

outcoming, incoming1, incoming2, score1, score2, score_w, score_d = [], [], [], [], [], [], []
for item in tqdm(to_find[:]):
    res1 = proc.extract(
                item,
                candidates,
                scorer=fuzz.partial_token_sort_ratio,
                limit=1
    )[0]
    res2 = proc.extract(
                item,
                candidates,
                scorer=fuzz.partial_ratio,
                limit=1
    )[0]
    outcoming.append(item)
    incoming1.append(res1[0])
    incoming2.append(res2[0])
    score1.append(res1[1])
    score2.append(res2[1])
    if len(res1) > 0:
        score_w.append(count_common_words(item, res1[0]))
        score_d.append(count_common_digits(item, res1[0]) / count_digit_share(res1[0]))


100%|██████████| 5000/5000 [00:09<00:00, 504.14it/s]

CPU times: user 9.9 s, sys: 23.4 ms, total: 9.93 s
Wall time: 9.92 s





## Create df with pairs of matched items and corresponing scores

In [16]:
comp = pd.DataFrame()
comp['right'] = outcoming
comp['left1'] = incoming1
comp['left2'] = incoming2

comp['score1'] = score1
comp['score2'] = score2
# Harmonical score for quality decision
comp['score3'] = 2 * comp['score1'] *comp['score2'] /(comp['score1'] + comp['score2'] )

comp['score_w'] = score_w
comp['score_d'] = score_d
# Metric for numbers in items name comparison
comp['score_d'] = comp['score_d'] /10

### Write preprocessed data to file to save time

In [17]:
parq_write(
    f'{os.getcwd()}/comp_full.parq',
    comp,
    compression='GZIP'
)

### Load preprocessed data from file

In [18]:
comp_parq = ParquetFile(f'{os.getcwd()}/comp_full.parq',)
comp = comp_parq.to_pandas()

### Concat incoming items with mathed pairs df

In [19]:
sub_result = pd.concat(
    [
        df_incoming,
        comp
    ],
    axis=1,
)
sub_result.shape

(5000, 15)

In [20]:
sub_result.sample(20)

Unnamed: 0.1,Unnamed: 0,#,Product or Service Name,Unit,Quantity/Volume,unit,items,right,left1,left2,score1,score2,score3,score_w,score_d
3146,200859,200860,SUPERFRESH PATATES 9X9 1000 GR 1X10,EDED,3,eded,superfresh patates 9 x 9 1000 gr 1 x 10,superfresh patates 9 x 9 1000 gr 1 x 10,superfresh patates 9 x 9 1000 gr 1 x 10,superfresh patates 9 x 9 1000 gr 1 x 10,100.0,100.0,100.0,1.0,0.333333
3256,204132,204133,FERSAN ELMA SIRKESI 50 CL CAM SISE 4% 1x12,ADET.CL,7,adet.cl,fersan elma sirkesi 50 cl cam sise 4 pct 1 x 12,fersan elma sirkesi 50 cl cam sise 4 pct 1 x 12,fersan elma sirkesi 50 cl cam sise 4 pct 1 x 12,fersan elma sirkesi 50 cl cam sise 4 pct 1 x 12,100.0,100.0,100.0,1.0,0.6
1589,838322,838323,PINAR SUT KIDO KARAMELLI 180 ML 1X27,ADET.ML,6,adet.ml,pinar sut kido karamelli 180 ml 1 x 27,pinar sut kido karamelli 180 ml 1 x 27,pinar sut kido kakaolu 180 ml 1 x 27,pinar sut kido kakaolu 180 ml 1 x 27,86.111111,86.111111,86.111111,0.666667,0.466667
1687,575291,575292,PINAR PEYNIR LABNE 400 GR 1x12,EDED,12,eded,pinar peynir labne 400 gr 1 x 12,pinar peynir labne 400 gr 1 x 12,pinar peynir labne 400 gr 1 x 12,pinar peynir labne 400 gr 1 x 12,100.0,100.0,100.0,1.0,0.416667
3694,256685,256686,SEVIMLI DAD KESMIK KAUNAS 0.3% 180 GR1x1,EDED,5,eded,sevimli dad kesmik kaunas 0.3 pct 180 gr 1 x 1,sevimli dad kesmik kaunas 0.3 pct 180 gr 1 x 1,sevimli dad kesmik kaunas 0.3 pct 180 gr 1 x 1,sevimli dad kesmik kaunas 0.3 pct 180 gr 1 x 1,100.0,100.0,100.0,1.0,0.514286
3411,317126,317127,PRESIDENT SIRIN K/M DOLCE VANIL 20% 200GR 1X12,EDED,6,eded,president sirin k dolce vanil 20 pct 200 gr 1 ...,president sirin k dolce vanil 20 pct 200 gr 1 ...,president sirin k dolce vanil 20 pct 200 gr 1 ...,president sirin k dolce vanil 20 pct 200 gr 1 ...,100.0,100.0,100.0,1.0,0.4875
941,839130,839131,SUPERFRESH BAMYA SIVRI 450 GR 1X16,EDED,32,eded,superfresh bamya sivri 450 gr 1 x 16,superfresh bamya sivri 450 gr 1 x 16,superfresh bamya sivri 450 gr 1 x 16,superfresh bamya sivri 450 gr 1 x 16,100.0,100.0,100.0,1.0,0.483333
4762,771266,771267,PRESIDENT KESME MAAZDAM 150 GR 1X15,EDED,10,eded,president kesme maazdam 150 gr 1 x 15,president kesme maazdam 150 gr 1 x 15,president kesme maazdam 150 gr 1 x 15,president kesme maazdam 150 gr 1 x 15,100.0,100.0,100.0,1.0,0.5
916,545481,545482,ULKER ICIMINO CIYELEK 45 GR 1X48,EDED,24,eded,ulker icimino ciyelek 45 gr 1 x 48,ulker icimino ciyelek 45 gr 1 x 48,ulker icimino ciyelek 45 gr 1 x 48,ulker icimino ciyelek 45 gr 1 x 48,100.0,100.0,100.0,1.0,0.54
683,219194,219195,SEVIMLI DAD KESMIK YAGSIZ 0% 180 GR 1x1,EDED,10,eded,sevimli dad kesmik yagsiz 0 pct 180 gr 1 x 1,sevimli dad kesmik yagsiz 0 pct 180 gr 1 x 1,sevimli dad kesmik yagsiz 0 pct 180 gr 1 x 1,sevimli dad kesmik yagsiz 0 pct 180 gr 1 x 1,100.0,100.0,100.0,1.0,0.566667


### Merge outcoming data to preprocessed pairs of matched products

In [21]:
compilation_full = sub_result.merge(df_outcoming, how='left', left_on='left1', right_on='items')
compilation_full = compilation_full.drop_duplicates(subset=['#_x']) 
compilation_full.shape

(5000, 22)

In [22]:
columns_to_use = [
          'items_x', 'items_y',
          'left1', 'left2',
          'Quantity/Volume_x', 'unit_x',
          'Quantity/Volume_y', 'unit_y',
          'score1', 'score2', 'score3', 'score_w', 'score_d',
          'Product or Service Name_x', 'Product or Service Name_y'
]

In [23]:
compilation_full = compilation_full[columns_to_use]

In [24]:
df_outcoming.shape, df_incoming.shape, compilation_full.shape

((5000, 7), (5000, 7), (5000, 15))

### Filter data by scores metrics to robust result

In [25]:
comp_true = compilation_full[
    (compilation_full.score3 > 78) 
    & 
    (compilation_full.score_d > 0.17)
]

### Exclude from pairs data we doubt according to the scores

In [26]:
comp_doubt = compilation_full.drop(comp_true.index, axis=0)


In [27]:
compilation_full.shape, comp_true.shape

((5000, 15), (4749, 15))

### Share of data we consider as matched correctly

In [28]:
comp_true.shape[0] / compilation_full.shape[0]

0.9498

In [29]:
compilation_scored = comp_true

### Preprocess units to see in the result table if they are the same for left and right item in matched pair

In [30]:
compilation_scored['unit_common'] = compilation_scored['unit_x'] + ' ' +  compilation_scored['unit_y']
compilation_scored['unit_common_set'] = compilation_scored['unit_common'].apply(lambda x: set(str(x).replace('.', ' ').split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['unit_common'] = compilation_scored['unit_x'] + ' ' +  compilation_scored['unit_y']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['unit_common_set'] = compilation_scored['unit_common'].apply(lambda x: set(str(x).replace('.', ' ').split()))


In [31]:
compilation_scored['unit_common_str'] = compilation_scored['unit_common_set'].apply(
    lambda x: str(list(x)[0]) if len(list(x)) == 0 else ' '.join(list(x))
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['unit_common_str'] = compilation_scored['unit_common_set'].apply(


### Convert quantaties to float for aggregating later

In [32]:
compilation_scored['quant_in'] = compilation_scored['Quantity/Volume_x'].astype('str').str.replace(',', '.').fillna(0).astype('float')
compilation_scored['quant_out'] = compilation_scored['Quantity/Volume_y'].astype('str').str.replace(',', '.').fillna(0).astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['quant_in'] = compilation_scored['Quantity/Volume_x'].astype('str').str.replace(',', '.').fillna(0).astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['quant_out'] = compilation_scored['Quantity/Volume_y'].astype('str').str.replace(',', '.').fillna(0).astype('float')


In [33]:
compilation_scored.shape

(4749, 20)

In [34]:
# x - incoming y - outcoming
compilation_scored[
    [
        'items_x', 'items_y',
        'score3', 'score1', 'score2', 'score_w', 'score_d',
        'Product or Service Name_x', 'Product or Service Name_y'
    ]
].sample(250)

Unnamed: 0,items_x,items_y,score3,score1,score2,score_w,score_d,Product or Service Name_x,Product or Service Name_y
43290,3 jelaniya ketcup kabablig 450 gr 1 x 30,3 jelaniya ketcup kabablig 450 gr 1 x 30,100.0,100.0,100.0,1.0,0.457143,3 JELANIYA KETCUP KABABLIG 450 GR 1x30,3 JELANIYA KETCUP KABABLIG 450 GR 1x30
36562,kirlangic sizma zeytinyagi 1 lt 1 x 12,kirlangic sizma zeytinyagi 1 lt 1 x 12,100.0,100.0,100.0,1.0,0.775000,KIRLANGIC SIZMA ZEYTINYAGI 1 LT PET 1X12,KIRLANGIC SIZMA ZEYTINYAGI 1 LT PET 1X12
9607,3 jelaniya ketcup kabablig 250 gr 1 x 60,3 jelaniya ketcup kabablig 250 gr 1 x 60,100.0,100.0,100.0,1.0,0.457143,3 JELANIYA KETCUP KABABLIG 250 GR 1x60,3 JELANIYA KETCUP KABABLIG 250 GR 1x60
44612,pinar peynir taze kasar 200 gr 1 x 12,pinar peynir taze kasar 200 gr 1 x 12,100.0,100.0,100.0,1.0,0.483333,PINAR PEYNIR TAZE KASAR 200 GR 1x12,PINAR PEYNIR TAZE KASAR 200 GR 1x12
2704,sevimli dad kesmik yagli 9 pct 180 gr 1 x 1,sevimli dad kesmik yagli 9 pct 180 gr 1 x 1,100.0,100.0,100.0,1.0,0.550000,SEVIMLI DAD KESMIK YAGLI 9% 180 GR 1x1,SEVIMLI DAD KESMIK YAGLI 9% 180 GR 1x1
...,...,...,...,...,...,...,...,...,...
10297,lays ridged pendir ile sogan 150 gr 1 x 18,lays ridged pendir ile sogan 150 gr 1 x 18,100.0,100.0,100.0,1.0,0.550000,LAYS RIDGED PENDIR ILE SOGAN 150 GR 1 x 18,LAYS RIDGED PENDIR ILE SOGAN 150 GR 1 x 18
29057,doritos acili 130 gr 1 x 16,doritos acili 130 gr 1 x 16,100.0,100.0,100.0,1.0,0.350000,DORITOS ACILI 130 GR 1 x 16,DORITOS ACILI 130 GR 1 x 16
34119,icimino sud sokoladli 180 ml 1 x 27,icimino sud sokoladli 180 ml 1 x 27,100.0,100.0,100.0,1.0,0.466667,ICIMINO SUD SOKOLADLI 180 ML 1X27,ICIMINO SUD SOKOLADLI 180 ML 1X27
11949,sevimli dad kefir 2.5 pct 500 gr 1 x 30,sevimli dad kefir 2.5 pct 500 gr 1 x 30,100.0,100.0,100.0,1.0,0.375000,SEVIMLI DAD KEFIR 2.5% 500 GR 1X30,SEVIMLI DAD KEFIR 2.5% 500 GR 1X30


### Get groups from preprocessed items names

In [35]:
compilation_scored['items_x_numberless'] = compilation_scored['items_x'].apply(lambda x: remove_numbers(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['items_x_numberless'] = compilation_scored['items_x'].apply(lambda x: remove_numbers(x))


In [36]:
compilation_scored['product_group'] = compilation_scored['items_x_numberless'].apply(lambda x: get_product_group(x))
compilation_scored['parent_group'] = compilation_scored['product_group'].apply(lambda x: get_parent_group(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['product_group'] = compilation_scored['items_x_numberless'].apply(lambda x: get_product_group(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compilation_scored['parent_group'] = compilation_scored['product_group'].apply(lambda x: get_parent_group(x))


### Aggregated result by items and their groups

In [37]:
general_aggregate = compilation_scored.groupby(
    by=['parent_group', 'product_group', 'Product or Service Name_x', 'Product or Service Name_y', 'unit_common_str']
).agg(
    {'quant_in':'sum','quant_out':'sum'}
).sort_values(
    ['unit_common_str']
)

general_aggregate

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,quant_in,quant_out
parent_group,product_group,Product or Service Name_x,Product or Service Name_y,unit_common_str,Unnamed: 5_level_1,Unnamed: 6_level_1
nane,bagdat nane,BAGDAT NANE 30 GR 1x15,BAGDAT NANE 30 GR 1x14,adet cl eded,15.0,14.0
elma,fersan elma,FERSAN ELMA SIRKESI 50 CL CAM SISE 4% 1x12,FERSAN ELMA SIRKESI 50 CL CAM SISE 4% 1x12,adet cl eded,2000.0,456.0
dovga,sevimli dovga,SEVIMLI DAD DOVGA 2.5% 900 GR 1x12,SEVIMLI DAD DOVGA 2.5% 900 GR 1x12,adet cl eded,198.0,16.0
nane,bagdat nane,BAGDAT NANE 30 GR 1x14,BAGDAT NANE 30 GR 1x14,adet cl eded,43.0,28.0
nar,fersan nar,FERSAN NAR EKSILI 25 CL CAM SISE 1x12,FERSAN NAR EKSILI 25 CL CAM SISE 1x12,adet cl eded,39.0,36.0
...,...,...,...,...,...,...
kuzya,kuzya kukuruz,KUZYA KUKURUZ 140 GR,KUZYA KUKURUZ 140 GR 1X12,ədəd eded,480.0,11.0
dolcia,dolcia puding,DOLCIA PUDING BANANLI 100GR 1X4X6,DOLCIA PUDING BANANLI 100GR 1X4X6,ədəd eded,48.0,96.0
kuzya,kuzya kukuruz,KUZYA KUKURUZ 140 GR AL 38 GR(1x30) HDY 1X1,KUZYA KUKURUZ 140 GR AL 38 GR(1x30) HDY 1X1,ədəd eded,70.0,24.0
ulker,ulker puding,ULKER PUDING ALPELLA SOKOLADLI 100 GR 1X24,ULKER PUDING ALPELLA SOKOLADLI 100 GR 1X24,ədəd eded,1670.0,378.0


### Final table with parent groups (brand) invetory balances

In [38]:
brand_agg = compilation_scored.groupby(
    ['parent_group', 'product_group', 'Product or Service Name_x', 'unit_common_str']
).agg(
    {'quant_in': 'sum', 'quant_out': 'sum'}
).reset_index()

brand_agg.rename(columns={'quant_in': 'brand_inventory_in', 'quant_out': 'brand_inventory_out'}, inplace=True)

result = pd.merge(brand_agg, general_aggregate, on='parent_group', how='left')

In [39]:
result.groupby(
    by=['parent_group', 'product_group', 'Product or Service Name_x', 'unit_common_str']
).agg(
    {'quant_in':'sum','quant_out':'sum', 'brand_inventory_in': 'max', 'brand_inventory_out': 'max'}
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,quant_in,quant_out,brand_inventory_in,brand_inventory_out
parent_group,product_group,Product or Service Name_x,unit_common_str,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
aci,berrak aci,BERRAK ACI SUS BIBERI TURSUSU 370 ML 1X12,ml adet eded,145.0,36.0,145.0,36.0
acili,doritos acili,DORITOS ACILI 130 GR 1 x 16,eded,1095.0,502.0,426.0,464.0
acili,doritos acili,DORITOS ACILI 74 GR 1 x 24,eded,1095.0,502.0,669.0,38.0
adjika,qlavprodukt adjika,QLAVPRODUKT ADJIKA AROMATNAYA 170 GR,ədəd eded,1022.0,174.0,38.0,90.0
adjika,qlavprodukt adjika,QLAVPRODUKT ADJIKA AROMATNAYA 170 QR 1X15,eded,1022.0,174.0,879.0,30.0
...,...,...,...,...,...,...,...
zolotoy,zolotoy petusok,ZOLOTOY PETUSOK NAGGETS TOYUQLU 300 GR 1X12(4639),eded,387.0,602.0,89.0,28.0
zolotoy,zolotoy petusok,ZOLOTOY PETUSOK NAGGETS VKUS. 300 GR 1x12(4719),eded,387.0,602.0,49.0,15.0
zolotoy,zolotoy petusok,ZOLOTOY PETUSOK NAGGETS XIRTXIRT 300 GR 1X12(4562),eded,387.0,602.0,22.0,168.0
zolotoy,zolotoy petusok,ZOLOTOY PETUSOK STRIPS VKUSNYASKI 300 GR1x12(4718),eded,387.0,602.0,7.0,24.0


In [40]:
general_aggregate.to_csv('general_aggregate.csv')

In [41]:
result.rename(
    columns={
        'parent_group': 'Parent Group Name',
        'product_group': 'Product Group',
        'brand_inventory_in': 'Group Inventory In Quantity',
        'brand_inventory_out': 'Group Inventory Out Quantity',
        'Product or Service Name_x': 'Individual Items',
        'unit_common_str': 'Product Unit',
        'quant_in': 'Inventory In Quantity',
        'quant_out': 'Inventory Out Quantity',
    },
).groupby(
    by=['Parent Group Name', 'Product Group', 'Individual Items', 'Product Unit',]
).agg(
    {'Inventory In Quantity':'sum', 'Inventory Out Quantity' :'sum', 'Group Inventory In Quantity': 'max', 'Group Inventory Out Quantity': 'max'}
).to_csv('result.csv')

In [42]:
result

Unnamed: 0,parent_group,product_group,Product or Service Name_x,unit_common_str,brand_inventory_in,brand_inventory_out,quant_in,quant_out
0,aci,berrak aci,BERRAK ACI SUS BIBERI TURSUSU 370 ML 1X12,ml adet eded,145.0,36.0,145.0,36.0
1,acili,doritos acili,DORITOS ACILI 130 GR 1 x 16,eded,426.0,464.0,426.0,464.0
2,acili,doritos acili,DORITOS ACILI 130 GR 1 x 16,eded,426.0,464.0,669.0,38.0
3,acili,doritos acili,DORITOS ACILI 74 GR 1 x 24,eded,669.0,38.0,426.0,464.0
4,acili,doritos acili,DORITOS ACILI 74 GR 1 x 24,eded,669.0,38.0,669.0,38.0
...,...,...,...,...,...,...,...,...
23508,zolotoy,zolotoy petusok,ZOLOTOY PETUSOK SUXARI TOYUQ STEYK 280GR1X12(4...,eded,57.0,16.0,57.0,16.0
23509,zolotoy,zolotoy petusok,ZOLOTOY PETUSOK SUXARI TOYUQ STEYK 280GR1X12(4...,eded,57.0,16.0,7.0,24.0
23510,zolotoy,zolotoy petusok,ZOLOTOY PETUSOK SUXARI TOYUQ STEYK 280GR1X12(4...,eded,57.0,16.0,38.0,105.0
23511,zolotoy,zolotoy petusok,ZOLOTOY PETUSOK SUXARI TOYUQ STEYK 280GR1X12(4...,eded,57.0,16.0,54.0,42.0


In [43]:
len(compilation_scored['Product or Service Name_x'].unique())

1039

In [44]:
len(compilation_scored['Product or Service Name_y'].unique())

886

In [45]:
len(df_incoming['Product or Service Name'].unique())

1236

In [46]:
len(df_outcoming['Product or Service Name'].unique())

1208

In [47]:
len(compilation_scored['Product or Service Name_x'].unique()) / len(compilation_scored['Product or Service Name_y'].unique()), len(df_incoming['Product or Service Name'].unique()) / len(df_outcoming['Product or Service Name'].unique())

(1.172686230248307, 1.0231788079470199)