In [1]:
import os
import pandas as pd

In [2]:
from rapidfuzz import process as proc
from rapidfuzz import fuzz

In [3]:
from tqdm import tqdm

In [4]:
from fastparquet import ParquetFile, write as parq_write

In [5]:
from src.utils import (
    simple_process_item, count_common_digits, count_common_words, count_digit_share,
    remove_numbers, get_product_group, get_parent_group, get_excluded_list
)

[nltk_data] Downloading package punkt to /home/varsey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/varsey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_ru to
[nltk_data]     /home/varsey/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_ru is already up-to-
[nltk_data]       date!


In [6]:
# Loading data
df_incoming = pd.read_excel(f'{os.getcwd()}/data/drl-data.xlsx', sheet_name='Inventory Out')
df_outcoming = pd.read_excel(f'{os.getcwd()}/data/drl-data.xlsx', sheet_name='Inventory In')

In [7]:
df_outcoming['unit'] = df_incoming['Unit'].str.lower()
df_incoming['unit'] = df_incoming['Unit'].str.lower()

In [8]:
exclude = get_excluded_list()

In [9]:
df_outcoming['items'] = df_outcoming['Product or Service Name'].apply(lambda x: simple_process_item(x, exclude))
df_incoming['items'] = df_incoming['Product or Service Name'].apply(lambda x: simple_process_item(x, exclude))

In [None]:
%%time

to_find = df_incoming['items'].to_list()
candidates = set(df_outcoming['items'].to_list())

outcoming, incoming1, incoming2, score1, score2, score_w, score_d = [], [], [], [], [], [], []
for item in tqdm(to_find[:]):
    res1 = proc.extract(
                item,
                candidates,
                scorer=fuzz.partial_token_sort_ratio,
                limit=1
    )[0]
    res2 = proc.extract(
                item,
                candidates,
                scorer=fuzz.partial_ratio,
                limit=1
    )[0]
    outcoming.append(item)
    incoming1.append(res1[0])
    incoming2.append(res2[0])
    score1.append(res1[1])
    score2.append(res2[1])
    if len(res1) > 0:
        score_w.append(count_common_words(item, res1[0]))
        score_d.append(count_common_digits(item, res1[0]) / count_digit_share(res1[0]))


  1%|          | 6524/937415 [01:31<3:43:39, 69.37it/s]

In [None]:
comp = pd.DataFrame()
comp['right'] = outcoming
comp['left1'] = incoming1
comp['left2'] = incoming2

comp['score1'] = score1
comp['score2'] = score2
comp['score3'] = 2 * comp['score1'] *comp['score2'] /(comp['score1'] + comp['score2'] )

comp['score_w'] = score_w
comp['score_d'] = score_d
comp['score_d'] = comp['score_d'] /10

In [None]:
parq_write(
    f'{os.getcwd()}/comp_full.parq',
    comp,
    compression='GZIP'
)

In [None]:
comp_parq = ParquetFile(f'{os.getcwd()}/comp_full.parq',)
comp = comp_parq.to_pandas()

In [None]:
sub_result = pd.concat(
    [
        df_incoming,
        comp
    ],
    axis=1,
)
sub_result.shape

In [None]:
sub_result.sample(20)

In [None]:
compilation_full = sub_result.merge(df_outcoming, how='left', left_on='left1', right_on='items')
compilation_full = compilation_full.drop_duplicates(subset=['#_x']) 
compilation_full.shape

In [None]:
columns_to_use = [
          'items_x', 'items_y',
          'left1', 'left2',
          'Quantity/Volume_x', 'unit_x',
          'Quantity/Volume_y', 'unit_y',
          'score1', 'score2', 'score3', 'score_w', 'score_d',
          'Product or Service Name_x', 'Product or Service Name_y'
]

In [None]:
compilation_full = compilation_full[columns_to_use]

In [None]:
df_outcoming.shape, df_incoming.shape, compilation_full.shape

In [None]:
comp_true = compilation_full[
    (compilation_full.score3 > 78) 
    & 
    (compilation_full.score_d > 0.17)
]

In [None]:
comp_doubt = compilation_full.drop(comp_true.index, axis=0)


In [None]:
compilation_full.shape, comp_true.shape

In [None]:
comp_true.shape[0] / compilation_full.shape[0]

In [None]:
compilation_scored = comp_true

In [None]:
compilation_scored['unit_common'] = compilation_scored['unit_x'] + ' ' +  compilation_scored['unit_y']
compilation_scored['unit_common_set'] = compilation_scored['unit_common'].apply(lambda x: set(str(x).replace('.', ' ').split()))

In [None]:
compilation_scored['unit_common_str'] = compilation_scored['unit_common_set'].apply(
    lambda x: str(list(x)[0]) if len(list(x)) == 0 else ' '.join(list(x))
)

In [None]:
compilation_scored['quant_in'] = compilation_scored['Quantity/Volume_x'].astype('str').str.replace(',', '.').fillna(0).astype('float')
compilation_scored['quant_out'] = compilation_scored['Quantity/Volume_y'].astype('str').str.replace(',', '.').fillna(0).astype('float')

In [None]:
compilation_scored.shape

In [None]:
# x - incoming y - outcoming
compilation_scored[
    [
        'items_x', 'items_y',
        'score3', 'score1', 'score2', 'score_w', 'score_d',
        'Product or Service Name_y'
    ]
].sample(250)

In [None]:
compilation_scored['items_x_numberless'] = compilation_scored['items_x'].apply(lambda x: remove_numbers(x))

In [None]:
compilation_scored['product_group'] = compilation_scored['items_x_numberless'].apply(lambda x: get_product_group(x))
compilation_scored['parent_group'] = compilation_scored['product_group'].apply(lambda x: get_parent_group(x))

In [None]:
general_aggregate = compilation_scored.groupby(
    by=['parent_group', 'product_group', 'items_x', 'unit_common_str']
).agg(
    {'quant_in':'sum','quant_out':'sum'}
).sort_values(
    ['unit_common_str']
)

general_aggregate

In [38]:
brand_agg = compilation_scored.groupby(
    ['parent_group', 'product_group', 'items_x', 'unit_common_str']
).agg(
    {'quant_in': 'sum', 'quant_out': 'sum'}
).reset_index()

brand_agg.rename(columns={'quant_in': 'brand_inventory_in', 'quant_out': 'brand_inventory_out'}, inplace=True)

result = pd.merge(brand_agg, general_aggregate, on='parent_group', how='left')

In [40]:
result.groupby(
    by=['parent_group', 'product_group', 'items_x', 'unit_common_str']
).agg(
    {'quant_in':'sum','quant_out':'sum', 'brand_inventory_in': 'max', 'brand_inventory_out': 'max'}
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,quant_in,quant_out,brand_inventory_in,brand_inventory_out
parent_group,product_group,items_x,unit_common_str,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,,7 up 0.5 lt 1 x 12,eded lt adet,1182.00,36360.0,146.0,10800.0
,,7 up 1 lt 1 x 12,eded lt adet,1182.00,36360.0,350.0,19800.0
,,7 up 2 lt 1 x 6,eded lt adet,1182.00,36360.0,686.0,5760.0
acili,doritos acili,doritos acili 130 gr 1 x 16,eded,57756.50,68674704.0,31578.5,30888000.0
acili,doritos acili,doritos acili 74 gr 1 x 24,eded,57756.50,68674704.0,26178.0,37786704.0
...,...,...,...,...,...,...,...
xama,sevimli xama,sevimli dad xama kaunas 10 pct 200 gr 1 x 12,eded,616299.02,60626273.0,97068.0,18825646.0
xama,sevimli xama,sevimli dad xama kaunasskaya 20 pct 200 gr 1 x 12,eded,616299.02,60626273.0,272818.0,13467693.0
xama,sevimli xama,sevimli dad xama kaunasskaya 20 pct 350 gr 1 x 12,eded,616299.02,60626273.0,115600.0,11232270.0
xama,sevimli xama,sevimli dad xama kaunasskaya 30 pct 200 gr 1 x 12,eded,616299.02,60626273.0,52494.0,4596900.0
