In [1]:
import os
import re
import sys
from datetime import datetime

import pymysql
import requests
from tqdm.auto import tqdm

import pandas as pd

# if getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS'):
#     root = sys._MEIPASS
# else:
#     cur_dir = os.path.dirname(os.path.realpath('__file__'))
#     root = os.path.abspath(os.path.join(cur_dir, os.pardir, os.pardir))
#     src  = os.path.join(root, 'src')
#     sys.path.append(src)

from database.access import AccessDatabase
from crawling.crawler import get_url, json_iterator
# Date stamp formatted as yymmdd. Note that later cells rebind `today` to
# hard-coded strings, so this value is only valid until those cells run.
today = datetime.today().strftime('%y%m%d')
# Database handles shared by the cells below (one per schema name).
db_glamai = AccessDatabase('glamai')
db_jangho = AccessDatabase('jangho')

  from .autonotebook import tqdm as notebook_tqdm


---
### Sephora Product Update

---
#### 1st) Refinement

In [None]:
# Step 1: run the refinement update; its return value is kept in `products`.
from sephora_product.refinement import Refinement
products = Refinement().update_refinement()

---
#### 2nd) Products By Subcategory

In [None]:
# Step 2: run the product-keyword update; its return value is kept in `upload_df`.
# NOTE(review): `upload_df` is rebound by the affiliate cells further down.
from sephora_product.product_keyword import ProductKeyword
upload_df = ProductKeyword().update_product_keyword()

---
#### 3rd) Update Vertical data

In [None]:
# Step 3: run the vertical-data update (return value, if any, is discarded).
from sephora_product.vertical_data import VerticalData
VerticalData().update_vertical_data()

---
#### 4th) Update Best & New & Vegan & Organic

In [None]:
# Step 4: run the best-seller/new update, then the vegan/organic keyword
# update, in that order. Return values are discarded.
from sephora_product.best_new import UpdateBestSellerNew
from sephora_product.keywords import SephoraVeganOrganic
UpdateBestSellerNew().update_best_new()
SephoraVeganOrganic().update_keywords()

---
#### 5th) Review Date

In [None]:
# Step 5: run the product review-date update; it returns a pair that is
# unpacked into `new_product_list` and `data`.
# NOTE(review): `data` is reused for unrelated results in later cells.
from sephora_product.review_date import ReviewDate
new_product_list, data = ReviewDate().update_review_date()

---
#### 6th) Insert product info

In [None]:
# Step 6: run the product-info insert; its return value is kept in `result`.
from sephora_product.insert_product_info import update_product_info
result = update_product_info()

---
#### 7th) All product update

In [None]:
# Step 7: run the all-product update; its return value overwrites `data`
# from step 5.
from sephora_product.all_product_update import update_all_product
data = update_all_product()

---
### Search Keywords Update

In [None]:
# Run the search-keyword update (result kept in `total_df`), then call
# `db_distinction()` afterwards; its return value is discarded.
from sephora_keyword.search_keyword import update_search_keywords, db_distinction
total_df = update_search_keywords()
db_distinction()

---
### Sephora Review Update

---
#### 1st) Review Data

In [None]:
# # backup table
# table = 'sephora_txt_data_re'
# db_glamai._backup(table_name=table, keep=True)

In [None]:
# Update review data: run the review crawl with backup=True. The call returns
# a pair, unpacked into `txt_data` and `error`; the next cell turns both into
# DataFrames.
# NOTE(review): `_crawling` is a private-looking method called from outside
# its class — confirm this is the intended entry point.
from sephora_review.review_data import ReviewData
txt_data, error = ReviewData()._crawling(backup=True)

In [None]:
# Build DataFrames from the crawl results and print a short summary.
# Inputs (from the previous cell): `txt_data` (review rows) and `error`
# (rows of product_code, product_url, note).
# NOTE(review): the column order below is assumed to match the row layout
# produced by the crawl — confirm against ReviewData.
columns = ['product_code', 'product_id', 'rating', 'skin_type', 'eye_color', 'skin_concerns', 'hair_color', 'skin_tone', 'age', 'title', 'txt_data', 'positive_count', 'write_time', 'regist_date']
rev_df = pd.DataFrame(txt_data, columns=columns)

error_df = pd.DataFrame(error, columns=['product_code', 'product_url', 'note'])
# Rows per distinct `note` value (index = note, sorted by the groupby).
error_df_cnt = error_df.groupby('note').count()

summary_lines = [
    f"product counts: {len(rev_df.product_code.unique())}",
    f"product review counts: {len(rev_df)}",
]
# Report every note category under its own text instead of reading fixed
# positions 0 and 1: positional access raised IndexError whenever fewer than
# two categories were present, and attached a hard-coded label to whichever
# note happened to sort first.
for note, count in error_df_cnt.iloc[:, 0].items():
    summary_lines.append(f"{note}: {count}")
print("\n".join(summary_lines))

---
#### 2nd) Review Date Update

In [None]:
# Run the review-side review-date update; its return value is kept in `result`.
# NOTE(review): this rebinds the name `ReviewDate`, which an earlier cell
# imported from `sephora_product.review_date`. After this cell runs, re-running
# the earlier call without its import line would use this class instead.
from sephora_review.review_data import ReviewDate
result = ReviewDate().update_review_date()

---
#### 3rd) Duplicate check

In [None]:
# Reference copy of the SQL used for the duplicate check. This cell only
# evaluates a string literal; nothing is sent to the database here.
'''
/* replace */ 
UPDATE sephora_txt_data_re SET txt_data = REPLACE(txt_data, '.Not impressed.', '.') WHERE BINARY(txt_data) LIKE '%Not impressed.';

/* check duplicated */
select product_code, txt_data, write_time, like_count, count(*) as cnt
from sephora_txt_data_re
group by product_code, txt_data, write_time
having cnt > 1;

/* dedup */

delete t1 
from sephora_txt_data_re t1, sephora_txt_data_re t2 
where 
t1.product_code=t2.product_code and 
t1.txt_data = t2.txt_data and 
t1.write_time =t2.write_time and
t1.like_count < t2.like_count;

delete t1 
from sephora_txt_data_re t1, sephora_txt_data_re t2
where 
t1.product_code = t2.product_code and 
t1.txt_data = t2.txt_data and
t1.write_time = t2.write_time and
t1.like_count = t2.like_count and
t1.pk < t2.pk;
'''

In [9]:

# Clean up and de-duplicate `sephora_txt_data_re` in three statements, each
# committed on its own:
#   1. strip a trailing '.Not impressed.' fragment from txt_data,
#   2. among rows sharing (product_code, txt_data, write_time), delete those
#      with a lower like_count,
#   3. among rows that also share like_count, delete those with a lower pk.
replace_query = "UPDATE sephora_txt_data_re SET txt_data = REPLACE(txt_data, '.Not impressed.', '.') WHERE BINARY(txt_data) LIKE '%Not impressed.';"
dedup_query_1 = '''
delete t1 
from sephora_txt_data_re t1, sephora_txt_data_re t2 
where 
t1.product_code=t2.product_code and 
t1.txt_data = t2.txt_data and 
t1.write_time =t2.write_time and
t1.like_count < t2.like_count;
'''
dedup_query_2 = '''
delete t1 
from sephora_txt_data_re t1, sephora_txt_data_re t2
where 
t1.product_code = t2.product_code and 
t1.txt_data = t2.txt_data and
t1.write_time = t2.write_time and
t1.like_count = t2.like_count and
t1.pk < t2.pk;'''

conn, curs = db_glamai._connect()
try:
    # Order matters: the text replacement can create new duplicates that the
    # two delete passes then remove.
    for statement in (replace_query, dedup_query_1, dedup_query_2):
        curs.execute(statement)
        conn.commit()
finally:
    # Always release the cursor and connection, even if a statement fails,
    # so a failed run does not leave a connection open on the server.
    curs.close()
    conn.close()

In [10]:
# Check query: list any (product_code, txt_data, write_time) groups that still
# have more than one row after the dedup step.
# NOTE(review): `like_count` is selected but not part of the GROUP BY; MySQL
# rejects this when ONLY_FULL_GROUP_BY is enabled — confirm the server's
# sql_mode.

query = '''
select product_code, txt_data, write_time, like_count, count(*) as cnt
from sephora_txt_data_re
group by product_code, txt_data, write_time
having cnt > 1;'''
conn, curs = db_glamai._connect()
try:
    curs.execute(query)
    data = curs.fetchall()
finally:
    # Release the cursor and connection even when the query fails.
    curs.close()
    conn.close()

# An empty result means no duplicate groups remain.
if not data:
    print('Complete dedup!')
else:
    print('Dedup Failed!')

Complete dedup!


---
###


---
### Sephora Product Status 



In [2]:
# Run the status update once per vertical and keep each returned value in
# `status_data_dict`, keyed by vertical name.
# The recorded output below shows this run being interrupted
# (KeyboardInterrupt) during the third vertical, so the dict then holds only
# the verticals completed before the interrupt.
from sephora_update.status import update_sephora_status

status_data_dict = {}
verticals = ['face_base', 'eye', 'lip_color', 'moisturizers', 'cheek', 'treatments', 'masks', 'eye_care', 'body_care', 'mens', 'fragrance_men', 'fragrance_women', 'wellness', 'cleansers']
for vertical in tqdm(verticals):
    status_data_df = update_sephora_status(vertical)
    status_data_dict[vertical] = status_data_df

  0%|          | 0/14 [00:00<?, ?it/s]



`face_base_product_info` Import Time: 0.3sec


100%|██████████| 11803/11803 [1:53:25<00:00,  1.73it/s]




`sephora_face_base_data_status` is backuped successful!
backup_table_name: sephora_face_base_data_status_bak_220928


  7%|▋         | 1/14 [1:53:28<24:35:04, 6808.05s/it]



Table Upload Success: `sephora_face_base_data_status`


`eye_product_info` Import Time: 0.2sec


100%|██████████| 6456/6456 [55:14<00:00,  1.95it/s]




`sephora_eye_data_status` is backuped successful!
backup_table_name: sephora_eye_data_status_bak_220928


 14%|█▍        | 2/14 [2:48:44<15:50:48, 4754.04s/it]



Table Upload Success: `sephora_eye_data_status`


`lip_color_product_info` Import Time: 0.2sec


 61%|██████▏   | 4362/7121 [4:42:48<2:58:52,  3.89s/it]
 14%|█▍        | 2/14 [7:31:33<45:09:20, 13546.71s/it]


KeyboardInterrupt: 

---
### Sephora Product Sale

In [3]:
# Run the sale update once per vertical and keep each returned value in
# `price_data_dict`, keyed by vertical name.
from sephora_update.sales import update_sephora_sale

price_data_dict = {}
# Full list of verticals, kept for reference:
# verticals = ['face_base', 'eye', 'lip_color', 'moisturizers', 'cheek', 'treatments', 'masks', 'eye_care', 'body_care', 'mens', 'fragrance_men', 'fragrance_women', 'wellness', 'cleansers']
# Active list starts at 'cheek'; the first four verticals are omitted.
# NOTE(review): presumably a resume after a partial run — restore the full
# list for a fresh run.
verticals = ['cheek', 'treatments', 'masks', 'eye_care', 'body_care', 'mens', 'fragrance_men', 'fragrance_women', 'wellness', 'cleansers']
for vertical in tqdm(verticals):
    price_data = update_sephora_sale(vertical)
    price_data_dict[vertical] = price_data

  0%|          | 0/10 [00:00<?, ?it/s]



`sephora_cheek_data_sale` is backuped successful!
backup_table_name: sephora_cheek_data_sale_bak_220929
cheek new product update 완료!


100%|██████████| 216/216 [02:25<00:00,  1.48it/s]
 10%|█         | 1/10 [02:27<22:04, 147.18s/it]

cheek product sale status update 완료!


`sephora_treatments_data_sale` is backuped successful!
backup_table_name: sephora_treatments_data_sale_bak_220929
treatments new product update 완료!


100%|██████████| 514/514 [05:25<00:00,  1.58it/s]
 20%|██        | 2/10 [07:53<33:39, 252.40s/it]

treatments product sale status update 완료!


`sephora_masks_data_sale` is backuped successful!
backup_table_name: sephora_masks_data_sale_bak_220929
masks new product update 완료!


100%|██████████| 166/166 [01:44<00:00,  1.59it/s]
 30%|███       | 3/10 [09:38<21:37, 185.33s/it]

masks product sale status update 완료!


`sephora_eye_care_data_sale` is backuped successful!
backup_table_name: sephora_eye_care_data_sale_bak_220929
eye_care new product update 완료!


100%|██████████| 184/184 [01:50<00:00,  1.66it/s]
 40%|████      | 4/10 [11:30<15:37, 156.24s/it]

eye_care product sale status update 완료!


`sephora_body_care_data_sale` is backuped successful!
backup_table_name: sephora_body_care_data_sale_bak_220929
body_care new product update 완료!


100%|██████████| 47/47 [00:28<00:00,  1.66it/s]
 50%|█████     | 5/10 [11:59<09:12, 110.40s/it]

body_care product sale status update 완료!


`sephora_mens_data_sale` is backuped successful!
backup_table_name: sephora_mens_data_sale_bak_220929
mens new product update 완료!


100%|██████████| 11/11 [00:07<00:00,  1.54it/s]
 60%|██████    | 6/10 [12:07<05:02, 75.54s/it] 

mens product sale status update 완료!


`sephora_fragrance_men_data_sale` is backuped successful!
backup_table_name: sephora_fragrance_men_data_sale_bak_220929
fragrance_men new product update 완료!


100%|██████████| 253/253 [02:44<00:00,  1.54it/s]
 70%|███████   | 7/10 [14:53<05:15, 105.00s/it]

fragrance_men product sale status update 완료!


`sephora_fragrance_women_data_sale` is backuped successful!
backup_table_name: sephora_fragrance_women_data_sale_bak_220929
fragrance_women new product update 완료!


100%|██████████| 693/693 [07:15<00:00,  1.59it/s]
 80%|████████  | 8/10 [22:10<07:01, 210.81s/it]

fragrance_women product sale status update 완료!


`sephora_wellness_data_sale` is backuped successful!
backup_table_name: sephora_wellness_data_sale_bak_220929
wellness new product update 완료!


100%|██████████| 125/125 [01:16<00:00,  1.63it/s]
 90%|█████████ | 9/10 [23:27<02:49, 169.11s/it]

wellness product sale status update 완료!


`sephora_cleansers_data_sale` is backuped successful!
backup_table_name: sephora_cleansers_data_sale_bak_220929
cleansers new product update 완료!


100%|██████████| 397/397 [04:00<00:00,  1.65it/s]
100%|██████████| 10/10 [27:29<00:00, 164.95s/it]

cleansers product sale status update 완료!





In [None]:
# verticals = ['face_base', 'eye', 'lip_color', 'moisturizers', 'cheek', 'treatments', 'masks', 'eye_care', 'body_care', 'mens', 'fragrance_men', 'fragrance_women', 'wellness', 'cleansers']

# def update_sephora_sale_v(vertical):
#     # backup table
#     table_name = f'sephora_{vertical}_data_sale'
#     db_glamai._backup(table_name=table_name, keep=True)
    
#     from sephora_update.sales import UpdateProductSale
#     sale = UpdateProductSale()
    
#     sale.__conn__()
#     product_codes = sale.get_data(vertical)
#     sale.insert_data_new(vertical)

#     status_info, price_datas = [], []
#     for product_code in tqdm(product_codes):
#         price_data, status = sale.update_data(product_code, vertical)
        
#         status_info.append([product_code, status])
#         price_datas += price_data
#     sale.__close__()
#     print(f'{vertical} product status update 완료!')

#     return price_data

---
### Affiliate price

---
#### Amazon update

In [None]:
from affiliate.amazon import get_data, _crawling, _upload

In [None]:
# Crawl Amazon affiliate prices.
# The helpers are reached through a module alias rather than the bare names
# `get_data` / `_crawling` / `_upload`: the Ulta section imports functions with
# the same names, so after that import a re-run of this cell would silently
# call the Ulta implementations.
import affiliate.amazon as amazon_affiliate

df_amazon = amazon_affiliate.get_data()
datas, error = [], []
for value in tqdm(df_amazon.values):
    data = amazon_affiliate._crawling(value)
    if data is None:
        # A None result is treated as a failed crawl; the row's 4th field is
        # recorded. NOTE(review): index 3 is assumed to be the affiliate URL
        # column of `get_data()` — confirm.
        affiliate_url = value[3]
        error.append(affiliate_url)
    else:
        datas.append(data)
# `_upload` returns a pair, unpacked into `crawling_df` and `upload_df`.
crawling_df, upload_df = amazon_affiliate._upload(datas)

In [None]:
# Alternative source kept for reference:
# upload_df = pd.read_csv('/Users/yeonseosla/Downloads/amazon_data.csv')
# NOTE(review): the table name passed to get_tbl is an empty string — it looks
# like the real name was removed; fill it in before running. This line also
# overwrites the `upload_df` produced by the crawl cell above.
upload_df = db_jangho.get_tbl('')
# Row counts per `is_use` value.
upload_df.groupby('is_use').count()

In [None]:
# Among rows with is_use == 1, row counts per `is_sale` value.
upload_df[upload_df.is_use==1].groupby('is_sale').count()

In [None]:
# _upload_df = upload_df.copy()

# _upload_df.loc[_upload_df.price < _upload_df.sale_price, 'price'] = _upload_df.loc[_upload_df.price < _upload_df.sale_price, 'sale_price']
# _upload_df.loc[(_upload_df.price==_upload_df.sale_price) & (_upload_df.price!=0), ['is_sale', 'is_use']] = [0, 1]
# _upload_df.loc[_upload_df.is_use==-1, ['is_sale', 'is_use']] = [0, 0]

In [None]:
# _upload_df.groupby('is_use').count()
# _upload_df[_upload_df.is_use==1].groupby('is_sale').count()
# _upload_df.loc[(_upload_df.is_use==1)& (_upload_df.is_sale==0) & (_upload_df.sale_price==0)]

In [None]:
# db_jangho.create_table(upload_df=_upload_df, table_name='affiliate_price_update_amazon')

In [None]:
# # sale_price == price

# today = '220914'
# # today = db_jangho.today
# query = f'''
# update jangho.affiliate_price_update_amazon_{today} as a
# join jangho.affiliate_price_update_amazon_{today} as b
# on a.product_code = b.product_code and a.item_no = b.item_no and a.affiliate_type = b.affiliate_type
# set a.sale_price = b.price
# where a.is_sale = 0 and a.is_use = 1 and a.sale_price = 0;'''

# conn, curs = db_jangho._connect()
# curs.execute(query)
# conn.commit()
# curs.close()
# conn.close()

In [None]:
# # is_use = 1

# today = '220914'
# # today = db_jangho.today
# query = f'''
# update affiliate_price_update_amazon_{today}
# set is_use = 1
# where is_use = 0 and price != 0;'''

# conn, curs = db_jangho._connect()
# curs.execute(query)
# conn.commit()
# curs.close()
# conn.close()

In [2]:
# Copy the Amazon price snapshot for `today` from the jangho staging table
# into glamai.affiliate_price, matching rows on
# (product_code, item_no, affiliate_type).
# today = db_jangho.today
today = '221006'  # hard-coded snapshot suffix; must match an existing staging table
query = f'''
update glamai.affiliate_price as a
join jangho.affiliate_price_update_amazon_{today} as b 
on a.product_code = b.product_code and a.item_no = b.item_no and a.affiliate_type = b.affiliate_type
set a.price = b.price, a.sale_price = b.sale_price, a.is_sale = b.is_sale, a.is_use = b.is_use, a.regist_date = b.regist_date, a.update_date = b.update_date;'''

conn, curs = db_jangho._connect()
try:
    curs.execute(query)
    conn.commit()
finally:
    # Release the cursor and connection even if the update fails (for example
    # when the staging table for `today` does not exist).
    curs.close()
    conn.close()

In [3]:
# Check query: list affiliate_price rows that are in use, not on sale, and
# have sale_price = 0. The recorded output of this run is an empty tuple.

query = 'select * from affiliate_price where is_use=1 and is_sale=0 and sale_price=0;'
conn, curs = db_glamai._connect()
try:
    curs.execute(query)
    data = curs.fetchall()
    print(data)
finally:
    # Release the cursor and connection even when the query fails.
    curs.close()
    conn.close()

()


---
#### Ulta update

In [None]:
from affiliate.ulta import get_data, _crawling, _upload

In [None]:
# Crawl Ulta affiliate prices.
# The helpers are reached through a module alias rather than the bare names
# `get_data` / `_crawling` / `_upload`: the Amazon section imports functions
# with the same names, so whichever import cell ran last would otherwise
# decide which implementation this cell calls.
import affiliate.ulta as ulta_affiliate

df_ulta = ulta_affiliate.get_data()
datas, error = [], []
for value in tqdm(df_ulta.values):
    data = ulta_affiliate._crawling(value)
    if data is None:
        # A None result is treated as a failed crawl; the row's 4th field is
        # recorded. NOTE(review): index 3 is assumed to be the affiliate URL
        # column of `get_data()` — confirm.
        affiliate_url = value[3]
        error.append(affiliate_url)
    else:
        datas.append(data)
# `_upload` returns a pair, unpacked into `crawling_df` and `upload_df`.
crawling_df, upload_df = ulta_affiliate._upload(datas)

In [None]:
# Upload the Ulta crawl result through the jangho handle.
# NOTE(review): the push cell below reads `affiliate_price_update_ulta_{today}`,
# so `create_table` presumably appends a date suffix to this name — confirm.
db_jangho.create_table(upload_df=upload_df, table_name='affiliate_price_update_ulta')

In [None]:
# today = '220913'
# # today = db_jangho.today
# query = f'''
# update jangho.affiliate_price_update_ulta_{today} as a
# join jangho.affiliate_price_update_ulta_{today} as b
# on a.product_code = b.product_code and a.item_no = b.item_no and a.affiliate_type = b.affiliate_type
# set a.sale_price = b.price
# where a.is_sale = 0 and a.is_use = 1 and a.sale_price = 0;'''

# conn, curs = db_jangho._connect()
# curs.execute(query)
# conn.commit()
# curs.close()
# conn.close()

In [None]:
# today = '220913'
# # today = db_jangho.today
# query = f'''
# update affiliate_price_update_ulta_{today}
# set is_use = 1
# where is_use = 0 and price != 0;'''

# conn, curs = db_jangho._connect()
# curs.execute(query)
# conn.commit()
# curs.close()
# conn.close()

In [4]:
# Copy the Ulta price snapshot for `today` from the jangho staging table into
# glamai.affiliate_price, matching rows on
# (product_code, item_no, affiliate_type).
# today = db_jangho.today
today = '221006'  # hard-coded snapshot suffix; must match an existing staging table
query = f'''
update glamai.affiliate_price as a
join jangho.affiliate_price_update_ulta_{today} as b 
on a.product_code = b.product_code and a.item_no = b.item_no and a.affiliate_type = b.affiliate_type
set a.price = b.price, a.sale_price = b.sale_price, a.is_sale = b.is_sale, a.is_use = b.is_use, a.regist_date = b.regist_date, a.update_date = b.update_date;'''

conn, curs = db_jangho._connect()
try:
    curs.execute(query)
    conn.commit()
finally:
    # Release the cursor and connection even if the update fails (for example
    # when the staging table for `today` does not exist).
    curs.close()
    conn.close()


In [5]:
# Check query: list affiliate_price rows that are in use, not on sale, and
# have sale_price = 0. The recorded output of this run is an empty tuple.

query = 'select * from affiliate_price where is_use=1 and is_sale=0 and sale_price=0;'
conn, curs = db_glamai._connect()
try:
    curs.execute(query)
    data = curs.fetchall()
    print(data)
finally:
    # Release the cursor and connection even when the query fails.
    curs.close()
    conn.close()

()


---
### Table Upload 

In [48]:
# glamai_youtube_urls: load the crawled YouTube CSV for the given date stamp.
# `.iloc[:, 1:]` drops the first CSV column (presumably a saved index — confirm).
# NOTE(review): the path is an absolute, user-specific location; the cell only
# runs on a machine that has this file.
_date = '220926'
df = pd.read_csv(f'/Users/yeonseosla/Downloads/glamai_youtube_total_{_date}_final.csv').iloc[:, 1:]

In [56]:
# Sample URLs illustrating the two shapes handled below (not used by the code).
yt_url = 'https://www.youtube.com/watch?v=3vpMAqQIs00'
shorts = 'https://www.youtube.com/shorts/tvY8dOnO82s'

# Work on a copy so `df` stays as loaded.
df_ = df.copy()

# At this point yt_url holds a site-relative path. Rows starting with
# '/watch?v=' (9 chars) keep everything after it as the id; all other rows
# drop their first 8 chars (the length of '/shorts/').
is_watch = df_.yt_url.str[:9] == '/watch?v='
df_.loc[is_watch, 'yt_id'] = df_.yt_url.str[9:]
df_.loc[~is_watch, 'yt_id'] = df_.yt_url.str[8:]

# Turn the relative path into an absolute URL, then flag shorts.
df_.loc[:, 'yt_url'] = 'https://www.youtube.com' + df_.loc[:, 'yt_url']
is_shorts = df_.yt_url.str.contains('shorts')
df_.loc[is_shorts, 'duration'] = 'SHORTS'

In [83]:
# Inspect column dtypes and non-null counts after normalisation; in the
# recorded output only `duration` has missing values.
df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24342 entries, 0 to 24341
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_code  24342 non-null  object
 1   thumbnail     24342 non-null  object
 2   title         24342 non-null  object
 3   yt_url        24342 non-null  object
 4   duration      6523 non-null   object
 5   youtuber      24342 non-null  object
 6   yt_id         24342 non-null  object
dtypes: object(7)
memory usage: 1.3+ MB


In [92]:
# Load the current `glamai_youtube_urls` table and drop its first column
# (presumably a primary key / index column — confirm).
glamai_youtube_urls = db_glamai.get_tbl('glamai_youtube_urls').iloc[:, 1:]



`glamai_youtube_urls` Import Time: 2.8sec


In [93]:
# set yt_id: strip the fixed URL prefix from the stored absolute URLs.
# The slice offsets are the lengths of the two prefixes (32 and 31 chars).
# NOTE(review): assumes every stored URL starts with exactly one of these
# prefixes — confirm.
watch_offset = len('https://www.youtube.com/watch?v=')
shorts_offset = len('https://www.youtube.com/shorts/')

has_watch = glamai_youtube_urls.yt_url.str.contains('watch')
glamai_youtube_urls.loc[has_watch, 'yt_id'] = glamai_youtube_urls.yt_url.str[watch_offset:]

has_shorts = glamai_youtube_urls.yt_url.str.contains('shorts')
glamai_youtube_urls.loc[has_shorts, 'yt_id'] = glamai_youtube_urls.yt_url.str[shorts_offset:]

In [108]:
# Merge the existing table rows with the newly crawled rows and drop duplicates.
# `subset` is the set of columns that defines a duplicate.
subset = ['product_code', 'thumbnail', 'title', 'yt_id', 'duration', 'youtuber']
# Sort keys: the duplicate-defining columns, then regist_date as tie-breaker.
by = subset + ['regist_date']
# Stamp the new rows with the current time (mutates `df_` in place).
df_.loc[:, 'regist_date'] = datetime.now()
df_concat = pd.concat([glamai_youtube_urls, df_])
# Descending sort puts the latest regist_date first within each duplicate group,
# so keep='first' retains that row.
# NOTE(review): `duration` is part of `subset`; rows that differ only in
# duration (e.g. missing vs. filled) are not treated as duplicates — confirm intended.
df_sorted = df_concat.sort_values(by=by, ascending=False)
df_dedup = df_sorted.drop_duplicates(subset=subset, keep='first', ignore_index=True)

In [111]:
# Upload the de-duplicated frame as `glamai_youtube_urls`. The recorded output
# shows a dated backup of the existing table being reported before the upload.
db_glamai.create_table(df_dedup, 'glamai_youtube_urls')



`glamai_youtube_urls` is backuped successful!
backup_table_name: glamai_youtube_urls_bak_220929


Table Upload Success: `glamai_youtube_urls`
