In [54]:
# necessary
import pandas as pd
import numpy as np
import pickle
from tqdm.auto import tqdm
import os
import ast
import re
import time
import copy
import difflib
from itertools import chain, repeat


from jellyfish import jaro_distance, jaro_winkler
from soynlp.hangle import jamo_levenshtein


import sys
# cur_dir = os.path.dirname(os.path.realpath(__file__))
# root = os.path.abspath(os.path.join(cur_dir, os.pardir, os.pardir))
# src = os.path.abspath(os.path.join(cur_dir, os.pardir))
# temp = root + '/temp'
# sys.path.append(root)
# sys.path.append(src)
# sys.path.append(temp)
    
# from access_database import access_db

def title_comparison(word_0: str, word_1: str) -> tuple:
    
    ''' 
    Compare Product Titles 
    '''
    
    non_sp_0 = word_0.replace(' ', '')
    non_sp_1 = word_1.replace(' ', '')

    len_0 = len(non_sp_0)
    len_1 = len(non_sp_1)
    max_len = max(len_0, len_1)
    min_len = min(len_0, len_1)
    
    # 토큰 중복 비율 계산
    word_sp_0 = word_0.split(' ')
    word_sp_1 = word_1.split(' ')
    
    '''string 종속 여부 체크'''

    leng = min(len(word_sp_0), len(word_sp_1))
    word_set = list(set(word_sp_0) & set(word_sp_1))
    dep_cnt = len(word_set)
    dep_ratio = dep_cnt / leng

    # calculate similarity
    cost = jamo_levenshtein(non_sp_0, non_sp_1)
    sim = (max_len - cost) / max_len

    return round(dep_ratio, 4), dep_cnt, round(sim, 4), max_len, min_len


def prd_mapper(input_data_0: pd.DataFrame, input_data_1: pd.DataFrame) -> pd.DataFrame:
    
    '''  
    Compare Product Titles after Grouping Brands and Categories
    
    Input Data 
    - input_data_0: Mapping Criteria Table
    - input_data_1: Mapping Target Table
    
    ** necessary columns: ['id', 'brand_name', 'product_name', 'category']
    
    Output Data 
    - compared_df: Product name comparison table 
    
    '''
    
    
    # Required column for product name comparison
    columns_necessary = ['id', 'brand_name', 'product_name', 'category', 'table_name']
    df_notnull_0 = input_data_0[input_data_0.product_name.notnull()].loc[:, columns_necessary].reset_index(drop=True)
    df_notnull_0.loc[:, 'brand_name'] = df_notnull_0.brand_name.str.replace(' ', '').str.lower()
    df_notnull_1 = input_data_1[input_data_1.product_name.notnull()].loc[:, columns_necessary].reset_index(drop=True)
    df_notnull_1.loc[:, 'brand_name'] = df_notnull_1.brand_name.str.replace(' ', '').str.lower()
    
    brands = df_notnull_1.brand_name.unique()
    categories = df_notnull_1.category.unique()

    columns = ['id_1', 'id_0', 'title_1', 'title_0', 'brand_name', 'table_name', 'category_1', 'category_0', 'similarity', 'dependency', 'max_length', 'min_length', 'dependency_ratio', 'dependency_count']
    compared_df = pd.DataFrame(columns=columns)

    # group by brand_name
    brd_grp_0 = df_notnull_0.groupby('brand_name')
    brd_grp_1 = df_notnull_1.groupby('brand_name')
    for brand in tqdm(brands):
        df_brd_1 = brd_grp_1.get_group(brand).reset_index(drop=True)
        try:
            df_brd_0 = brd_grp_0.get_group(brand).reset_index(drop=True)
    
        # 매핑 기준 테이블에 해당 브랜드가 존재하지 않는 경우 
        except KeyError:
            continue
        
        for idx_1 in range(len(df_brd_1)):
            id_1, categ_1, title_1, tbl = df_brd_1.loc[idx_1, ['id', 'category', 'product_name', 'table_name']]

            # brand grouping
            df = pd.DataFrame(columns=columns)
            ck = 0
            for idx_0 in range(len(df_brd_0)):
                id_0, categ_0, title_0 = df_brd_0.loc[idx_0, ['id', 'category', 'product_name']]

                # category grouping
                if categ_1 == categ_0:
                    # product name comparing
                    compare_output = title_comparison(title_1, title_0)
                    sim = compare_output[0]

                    # similarity = 1 -> break
                    if sim == 1:
                        compared_df.loc[len(compared_df)] = (id_1, id_0, title_1, title_0, brand, tbl, categ_1, categ_0) + compare_output
                        ck = 1
                        break

                    else:
                        df.loc[len(df)] = (id_1, id_0, title_1, title_0, brand, tbl, categ_1, categ_0) + compare_output

                else:
                    pass

            # data assigning to DataFrame
            if ck == 1:
                pass

            elif len(df) == 0:
                pass

            else:
                compared_df = pd.concat([compared_df, df])
    
    compared_df = compared_df.reset_index(drop=True)
                
    return compared_df


def select_mapped_prd(input_data: pd.DataFrame) -> pd.DataFrame:
    
    '''  
    Select Mapped Products by Criteria You Define
    
    Input Data 
    - input_data: Product name comparison table
    
    ** necessary columns = ['id_1', 'id_0', 'title_1', 'title_0', 'brand_name', 'table_name', 'category_1', 'category_0', 'similarity', 'dependency', 'max_length', 'min_length', 'dependency_ratio', 'dependency_count']
    
    Output Data 
    - mapped_df: Product Name Mapping Complete Table
    
    '''
    
    params = {
        'min_length': 6, # product name minimum length
        'min_token': 3, # product name token minimum length
        'levenshtein_similarity': round(5/6, 4) # similarity minimum value
    }

    min_len = params['min_length']
    min_tk = params['min_token']
    min_sim = params['levenshtein_similarity']
    
    compared_df = input_data.copy()
    # mapping group 0: simimlarity == 1 & min_length >= min_len
    grp_0 = compared_df[(compared_df.similarity==1) & (compared_df.min_length>=min_len)]
    grp_sim = grp_0.sort_values(by=['id_1', 'dependency_ratio', 'dependency_count', 'similarity'], ascending=False).reset_index(drop=True)

    # mapping group 1: simimlarity != 1 & dependency_ratio = 1 & min_length >= min_len
    grp_1 = compared_df[(compared_df.similarity!=1) & (compared_df.dependency_ratio==1) & (compared_df.min_length>=min_len)].sort_values(by=['id_1', 'dependency_count', 'similarity'], ascending=False)
    grp_dedup = grp_1.drop_duplicates(subset=['id_1'], keep='first').reset_index(drop=True)

    # mapping group 2: simimlarity != 1 & dependency_ratio != 1 & min_length >= min_len
    df_dedup = pd.concat([compared_df, grp_sim, grp_dedup]).drop_duplicates(subset=['id_1'], keep=False).reset_index(drop=True)
    grp_2 = df_dedup[df_dedup.min_length>=min_len].reset_index(drop=True)

    # calculate levenshtein distance
    for idx in tqdm(range(len(grp_2))):
        s0 = grp_2.loc[idx, 'title_0'].replace(' ', '')
        s1 = grp_2.loc[idx, 'title_1'].replace(' ', '')
        sim = grp_2.loc[idx, 'similarity']
        winkler = jaro_winkler(s0, s1)
        grp_2.loc[idx, 'jaro_winker'] = round(winkler, 4)
    
    # mapping group 3: simimlarity != 1 & dependency_ratio != 1 & min_length >= min_len & levenshtein_similarity >= min_sim
    grp_levenshtein = grp_2.loc[(grp_2.similarity>=min_sim)].reset_index(drop=True)

    mapped_df = pd.concat([grp_sim, grp_dedup, grp_levenshtein]).sort_values('id_1').reset_index(drop=True)

    return mapped_df


def md_map_tbl(input_data: pd.DataFrame) -> pd.DataFrame:
    
    # with open('./config/info_table_id_dict.txt', 'rb') as f:
    #     info_table_id_dict = pickle.load(f)    
    # table_id = info_table_id_dict[table_name]
    
    mapping_table = pd.DataFrame(columns=['glowpick_product_info_final_version_id', 'mapped_id', 'table_name'])
    for table_name in input_data.table_name.unique():
        
        mapped_df = input_data.loc[input_data.table_name==table_name].reset_index(drop=True)

        for id_0 in tqdm(mapped_df.id_0.unique()):

            ids = mapped_df.loc[mapped_df.id_0==id_0, 'id_1'].values.tolist()

            if len(ids) == 1:
                ids = str(ids[0])

            else:
                ids = str(ids)

            mapping_table.loc[len(mapping_table)] = int(id_0), ids, table_name

    
    return mapping_table


def concat_map_tbl():
    
    ''' new mapping table concat '''

    mapping_table = pd.DataFrame()
    
    files = [f for f in os.listdir(temp) if re.match(r'mapping_table_[0-9].csv', f)]
    for table in files:
        tbl = pd.read_csv(temp + f'/{table}')
        mapping_table = pd.concat([mapping_table, tbl])
    mapping_table = mapping_table.reset_index(drop=True)
    
    mapping_table.to_csv(temp + '/mapping_table.csv', index=False)
    
    return None


def update_map_tbl(user_name, password, db_name):
    
    ''' update mapping table '''
    db = access_db.AccessDataBase(user_name, password, db_name)
    
    # get existing mapping table to db
    map_tbl_ex = db.get_tbl(db_name, 'naver_glowpick_mapping_table', 'all') 
    # get new mapping table to dir(temp)
    map_tbl_new = pd.read_csv(temp + '/mapping_table.csv')
    
        
    df_concat = pd.concat([map_tbl_ex, map_tbl_new]).reset_index(drop=True)

    # updated mapping table
    map_tbl = pd.DataFrame(columns=map_tbl_new.columns)

    for tbl in df_concat.table_name.unique():
        df_tbl = df_concat[df_concat.table_name==tbl]
        
        for id_ in tqdm(df_tbl.glowpick_product_info_final_version_id.unique()):
            mapped_id = df_tbl.loc[df_tbl.glowpick_product_info_final_version_id==id_, 'mapped_id']
            
            mapped_ids = []
            for ids in mapped_id:
                if ids[0] == '[':
                    ids = ast.literal_eval(ids)
                    mapped_ids += ids
                
                else:
                    mapped_ids.append(int(ids))
            
            mapped_ids = list(set(mapped_ids))
            if len(mapped_ids) == 0:
                print('wrn')
                break
            
            elif len(mapped_ids) == 1:
                mapped_ids = str(mapped_ids[0])

            else:
                mapped_ids = str(mapped_ids)

                
            map_tbl.loc[len(map_tbl)] = int(id_), str(mapped_ids), str(tbl)
            
    map_tbl = map_tbl.sort_values(by=['glowpick_product_info_final_version_id', 'table_name']).reset_index(drop=True)
    map_tbl.to_csv(temp + '/naver_glowpick_mapping_table.csv', index=False)

    # dup check
    dup = map_tbl[map_tbl.duplicated(subset=['glowpick_product_info_final_version_id', 'table_name'], keep=False)]
    if len(dup) == 0:
        comment = 'Mapping table update complete!'
    
    else:
        comment = 'Duplicate check required!'
    
    unq_prd_0 = len(map_tbl.glowpick_product_info_final_version_id.unique())
    unq_prd_1 = (map_tbl.mapped_id.str.count(',') + 1).sum()
    
    return comment, unq_prd_0, unq_prd_1
    
    
def upload_map_tbl(user_name, password, db_name, table_name, columns):
    
    ''' upload mapping table 
    
    <업로드 조건> 
    ** 기존 매핑 테이블 오늘 날짜로 백업 완료
    ** db에서 매핑테이블 생성 및 컬럼명 일치 여부 확인 
     
    '''
    db = access_db.AccessDataBase(user_name, password, db_name)
    
    map_tbl = pd.read_csv(temp + '/naver_glowpick_mapping_table.csv').loc[:, columns]
    
    db.engine_upload(map_tbl, db_name, table_name)
    
    comment = 'Mapping table upload is complete!'
    
    return comment

In [63]:
df_0 = pd.read_csv('deprepro_categ_0.csv')
df_1 = pd.read_csv('deprepro_categ_1.csv')

# concat, drop category null, length condition check
df_concat = pd.concat([df_0, df_1])
df_concat_ = df_concat[df_concat.category.notnull()].reset_index(drop=True)
df_concat_.loc[:, 'product_name_'] = df_concat_.product_name.str.replace(' ', '')
df_concat_.loc[:, 'length'] = df_concat_.product_name_.str.len()
df_len = df_concat_[df_concat_.length >= 6].reset_index(drop=True)

# find duplicate
subset = ['brand_name', 'category', 'product_name_']
df_dup = df_len[df_len.duplicated(subset=subset, keep=False)]

# grouping 
df_grp = df_dup.groupby(subset)
grp_index = df_grp.size().index

# 브랜드, 카테고리, 상품명 모두 일치하는 상품들 그룹핑 후 매핑 
columns=['glowpick_product_info_final_version_id', 'mapped_id', 'table_name'] 
df_map = pd.DataFrame(columns=columns)

for idx in tqdm(grp_index):
    # 하나의 (brand_name, category, product_name) 유니크 그룹
    df = df_grp.get_group(idx)
    
    # 글로우픽 기준으로 매핑 작업 
    if 'glowpick_product_info_final_version' in df.table_name.tolist():
        id_ = df.loc[df.table_name=='glowpick_product_info_final_version'].id.tolist()[0]        
        tbls = df.loc[df.table_name!='glowpick_product_info_final_version'].table_name.unique().tolist()
        
        for tbl in tbls:
            mapped_ids = df.loc[df.table_name==tbl].id.tolist()
            
            if len(mapped_ids) == 0:
                continue

            elif len(mapped_ids) == 1:
                mapped_id = str(mapped_ids[0])

            else:
                mapped_id = str(mapped_ids)

            df_map.loc[len(df_map)] = int(id_), str(mapped_id), str(tbl)
            
    else:
        pass

# sorting
df_map = df_map.sort_values(by=['glowpick_product_info_final_version_id', 'table_name'])

  0%|          | 0/28078 [00:00<?, ?it/s]

In [64]:
len0 = len(df_map[df_map.duplicated(subset=['glowpick_product_info_final_version_id', 'table_name'], keep=False)])
len1 = len(df_map[df_map.duplicated(subset=['mapped_id', 'table_name'], keep=False)])
if len0 + len1 == 0:
    print('\n\tComplete!!')


	Complete!!


In [377]:
def map_expand(mapping_table: pd.DataFrame) -> pd.DataFrame:
    mapping_table = mapping_table.reset_index(drop=True)
    
    mapping_ = pd.DataFrame(columns=['item_key', 'id', 'table_name'])
    
    for idx in tqdm(range(len(mapping_table))):
        id_0 = mapping_table.iloc[idx, 0]
        ids = mapping_table.iloc[idx, 1]
        tbl = mapping_table.iloc[idx, 2]
        
        if ids[0] == '[':
            ids = ast.literal_eval(ids)
            for id_1 in ids:
                mapping_.loc[len(mapping_)] = int(id_0), int(id_1), str(tbl)

        else:
            id_1 = ids
            mapping_.loc[len(mapping_)] = int(id_0), int(id_1), str(tbl)
    
    return mapping_

In [378]:
df_map_ = map_expand(df_map)


  0%|          | 0/18432 [00:00<?, ?it/s]

In [379]:
dup_col = ['id', 'table_name']

df_concat = pd.concat([df_1, df_map_.loc[:, dup_col]])
df_dedup = df_concat.drop_duplicates(subset=dup_col, keep=False)

In [380]:
if len(df_1) - len(df_map_) == len(df_dedup):
    print('complete!!')

complete!!


In [381]:
df_dedup.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93859 entries, 0 to 117649
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                93859 non-null  object 
 1   brand_name        93859 non-null  object 
 2   product_name_old  93859 non-null  object 
 3   selection         93859 non-null  object 
 4   division          93859 non-null  object 
 5   groups            93858 non-null  object 
 6   table_name        93859 non-null  object 
 7   pk                93859 non-null  float64
 8   product_name      93859 non-null  object 
 9   category          79943 non-null  object 
dtypes: float64(1), object(9)
memory usage: 7.9+ MB


In [382]:
df_map_[df_map_.duplicated(subset=dup_col, keep=False)]

Unnamed: 0,item_key,id,table_name


In [None]:
mapped_prds = prd_mapper(df_0, df_1)

  0%|          | 0/6311 [00:00<?, ?it/s]

In [6]:
compared_df = pd.read_csv('compared_prds.csv')

In [12]:
mapped_df = select_mapped_prd(compared_df)

  0%|          | 0/692 [00:00<?, ?it/s]

In [20]:
grp_0 = mapped_df[mapped_df.similarity==1]

In [21]:
grp_1 = mapped_df[(mapped_df.similarity!=1) & (mapped_df.dependency_ratio==1)]

In [22]:
grp_2 = mapped_df[(mapped_df.similarity!=1) & (mapped_df.dependency_ratio!=1)]

In [26]:
len(grp_0) + len(grp_1) + len(grp_2)

45725

In [55]:
map_0 = md_map_tbl(grp_0)

  0%|          | 0/5414 [00:00<?, ?it/s]

  0%|          | 0/8217 [00:00<?, ?it/s]

  0%|          | 0/7270 [00:00<?, ?it/s]

  0%|          | 0/2038 [00:00<?, ?it/s]

In [57]:
map_0.sort_values(by='glowpick_product_info_final_version_id')

Unnamed: 0,glowpick_product_info_final_version_id,mapped_id,table_name
5416,34,11,naver_beauty_product_info_extended_v1_211217
5045,34,24792,naver_beauty_product_info_extended_v3_220124
5417,35,15,naver_beauty_product_info_extended_v1_211217
4158,77,20401,naver_beauty_product_info_extended_v3_220124
5422,88,"[46, 52]",naver_beauty_product_info_extended_v1_211217
...,...,...,...
22557,111537,"[18550, 18552, 18554]",naver_beauty_product_info_extended_v4_220311
13757,111543,1212,naver_beauty_product_info_extended_v2_211231
19099,111549,31084,naver_beauty_product_info_extended_v2_211231
22572,111549,18663,naver_beauty_product_info_extended_v4_220311


In [66]:
df_map

Unnamed: 0,glowpick_product_info_final_version_id,mapped_id,table_name
483,34,11,naver_beauty_product_info_extended_v1_211217
482,34,24792,naver_beauty_product_info_extended_v3_220124
484,35,15,naver_beauty_product_info_extended_v1_211217
515,77,20401,naver_beauty_product_info_extended_v3_220124
527,88,"[52, 46]",naver_beauty_product_info_extended_v1_211217
...,...,...,...
17346,111537,"[18550, 18552, 18554]",naver_beauty_product_info_extended_v4_220311
22825,111543,1212,naver_beauty_product_info_extended_v2_211231
17332,111549,31084,naver_beauty_product_info_extended_v2_211231
17331,111549,"[838, 672]",naver_beauty_product_info_extended_v3_220124
