In [1]:
import os
import re
import sys
import time 
import html
from datetime import datetime

import pymysql
import requests
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException

import pandas as pd

# Resolve the project root and make the local `src` packages importable.
# When `sys.frozen` and `sys._MEIPASS` are both set (presumably a PyInstaller
# bundle -- TODO confirm) the bundle directory is used as root and sys.path is
# left untouched.
if getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS'):
    root = sys._MEIPASS
else:
    # NOTE: '__file__' is a string literal here, not the __file__ variable, so
    # realpath() resolves it against the current working directory and `root`
    # ends up being the directory the kernel was started in.
    root = os.path.dirname(os.path.realpath('__file__'))
    src  = os.path.join(root, 'src')
    sys.path.append(src)

# Project-local imports; these must come after the sys.path update above.
from database.access import AccessDatabase
from crawling.crawler import get_url, json_iterator
# Shared database handles used by the cells below.
db_glamai = AccessDatabase('glamai')
db_jangho = AccessDatabase('jangho')

  from .autonotebook import tqdm as notebook_tqdm


---
### Sephora sale update

In [None]:
class UpdateProductSale:
    """Refresh the ``sephora_{vertical}_data_sale`` tables from the Sephora catalog API.

    The instance owns one database connection/cursor pair (``ds_conn`` /
    ``ds_cur``) obtained from ``AccessDatabase('glamai')._connect()``.  Every
    statement is executed through a retry loop that reconnects after a failure.

    Prices and SKU ids are extracted by running regular expressions over the
    ``str()`` of the API payload, so the patterns below match the Python repr of
    a dict (single-quoted keys and values).
    """

    # Seconds to wait before reconnecting after a failed statement.
    RETRY_DELAY_SEC = 100

    # Patterns matched against ``str(payload)``.
    _LIST_PRICE_PATTERN = r"\'listPrice\': \'\$[0-9]{0,5}.[0-9]{0,5}"
    _SALE_PRICE_PATTERN = r"\'salePrice\': \'\$[0-9]{0,5}.[0-9]{0,5}"
    _SKU_ID_PATTERN = r"\'skuId\': \'[0-9]{5,10}"

    def __init__(self):
        self.__conn__()

    def __conn__(self):
        """Open a new connection and cursor, replacing any previously stored pair."""
        self.ds = AccessDatabase('glamai')
        self.ds_conn, self.ds_cur = self.ds._connect()

    def __close__(self):
        """Commit pending work, then close the cursor and the connection."""
        self.ds_conn.commit()
        self.ds_cur.close()
        self.ds_conn.close()

    def _reconnect(self):
        """Drop the current connection (best effort) and open a fresh one."""
        try:
            self.__close__()
        except Exception:
            # The connection is typically already dead when we get here, so
            # commit()/close() may raise; that must not abort the reconnect.
            pass
        self.__conn__()

    def _run_with_retry(self, query, args=None, fetch=False):
        """Execute ``query`` until it succeeds, reconnecting after each failure.

        Args:
            query: SQL text to execute.
            args: optional parameters passed to ``cursor.execute``.
            fetch: when true, return ``list(cursor.fetchall())`` and do not
                commit; otherwise commit and return ``None``.

        Note:
            The loop is unbounded, as in the original code: a statement that
            fails for a reason other than a lost connection is retried forever.
            The error is printed so such a case is visible.
        """
        while True:
            try:
                if args is None:
                    self.ds_cur.execute(query)
                else:
                    self.ds_cur.execute(query, args)
                if fetch:
                    return list(self.ds_cur.fetchall())
                self.ds_conn.commit()
                return None
            except Exception as exc:
                print(f'DB error: {exc!r}')
                time.sleep(self.RETRY_DELAY_SEC)
                self._reconnect()
                print("DB 연결 끊김 ... 재연결 성공!")

    @staticmethod
    def _parse_price(match_text):
        """Turn a matched ``'listPrice': '$12.50`` fragment into ``12.5``."""
        return float(match_text.split(":")[1].replace("'", "").replace(" ", "").replace("$", ""))

    @staticmethod
    def _parse_sku_id(match_text):
        """Turn a matched ``'skuId': '1234567`` fragment into ``1234567``."""
        return int(match_text.split(":")[1].replace("'", "").replace(" ", ""))

    def insert_data_new(self, vertical):
        """Insert in-use (product_code, item_no) pairs missing from the sale table.

        ``vertical`` is interpolated into the table names, so it must come from
        a trusted list (table names cannot be bound as query parameters).
        """
        # insert new product
        query = f'''
                    insert into sephora_{vertical}_data_sale (product_code, item_no, list_price, regist_date)
                    select product_code, item_no, price, regist_date from sephora_{vertical}_data_status
                    where is_use=1 and (product_code, item_no) not in (select product_code, item_no from sephora_{vertical}_data_sale);
                '''
        self._run_with_retry(query)
        print(f'{vertical} new product update 완료!')

    def get_data(self, vertical):
        """Return the distinct in-use product codes of ``vertical`` as a list of rows."""
        table = f'sephora_{vertical}_data_status'
        query = f'select distinct(product_code) from {table} where is_use=1;'
        return self._run_with_retry(query, fetch=True)

    def scraper_price(self, data, product_code):
        """Extract price info for the product's current SKU.

        Returns:
            ``[list_price, sale_price, is_sale, timestamp, product_code, item_no]``
            or ``None`` when the payload has no ``currentSku`` or no SKU id can
            be found in it.  (Previously a missing SKU id raised
            UnboundLocalError; without an item number the row cannot be matched
            by the UPDATE anyway.)
        """
        try:
            current_sku = str(data['currentSku'])
        except KeyError:
            return None

        sku_match = re.search(self._SKU_ID_PATTERN, current_sku)
        if sku_match is None:
            return None
        item_no = self._parse_sku_id(sku_match.group())

        list_match = re.search(self._LIST_PRICE_PATTERN, current_sku)
        if list_match is None:
            list_price = float(0)
        else:
            list_price = self._parse_price(list_match.group())

        sale_match = re.search(self._SALE_PRICE_PATTERN, current_sku)
        if sale_match is None:
            sale_price = float(0)
            is_sale = 0
        else:
            sale_price = self._parse_price(sale_match.group())
            is_sale = 1

        return [list_price, sale_price, is_sale, datetime.now(), product_code, item_no]

    def scraper_price_sale(self, data, product_code):
        """Extract price info for every SKU listed under ``onSaleChildSkus``.

        Returns:
            A list of ``[list_price, sale_price, is_sale, timestamp,
            product_code, item_no]`` rows, or ``None`` when no SKU id is found
            or the numbers of list prices, sale prices and SKU ids differ
            (the three lists are paired by position).
        """
        on_sale_sku_text = str(data.get("onSaleChildSkus"))

        list_prices = re.findall(self._LIST_PRICE_PATTERN, on_sale_sku_text)
        sale_prices = re.findall(self._SALE_PRICE_PATTERN, on_sale_sku_text)
        sku_ids = re.findall(self._SKU_ID_PATTERN, on_sale_sku_text)

        if not sku_ids:
            return None
        if not (len(list_prices) == len(sale_prices) == len(sku_ids)):
            return None

        scraped_data = []
        for sku_text, list_text, sale_text in zip(sku_ids, list_prices, sale_prices):
            item_no = self._parse_sku_id(sku_text)
            list_price = self._parse_price(list_text)
            sale_price = self._parse_price(sale_text)
            scraped_data.append([list_price, sale_price, 1, datetime.now(), product_code, item_no])
        return scraped_data

    def insert_data(self, data, vertical):
        """Update one sale row; ``data`` is a row as returned by the scrapers."""
        # update product sales
        query = f'''
                update sephora_{vertical}_data_sale set list_price = %s, sale_price = %s, is_sale = %s, update_date = %s
                where product_code = %s and item_no = %s;
            '''
        self._run_with_retry(query, args=data)

    def update_data(self, product_code, vertical):
        """Fetch one product from the API and write its prices to the sale table.

        Args:
            product_code: a row from :meth:`get_data`; it is indexed with
                ``['product_code']``.
            vertical: vertical name used in the table names.

        Returns:
            ``(price_data, status)`` where ``price_data`` is the list of rows
            written and ``status`` is a counter: -1 when the API returned
            nothing, otherwise +1/-1 for each of the current-SKU and
            on-sale-SKU steps that succeeded/failed.
        """
        product_code = product_code['product_code']
        price_data = []
        # NOTE(review): the API passkey is hard-coded in the URL; consider
        # moving it to configuration or an environment variable.
        url = f'https://www.sephora.com/api/catalog/products/{product_code}?preferedSku=&includeConfigurableSku=true&passkey=caQ0pQXZTqFVYA1yYnnJ9emgUiW59DXA85Kxry8Ma02HE'
        res_data = json_iterator(url)
        status = 0
        if res_data is None:
            status -= 1
        else:
            scraped_data = self.scraper_price(res_data, product_code)
            if scraped_data is None:
                status -= 1
            else:
                self.insert_data(scraped_data, vertical)
                price_data.append(scraped_data)
                status += 1

            scraped_datas = self.scraper_price_sale(res_data, product_code)
            if scraped_datas is None:
                status -= 1
            else:
                for scraped_data in scraped_datas:
                    self.insert_data(scraped_data, vertical)
                    price_data.append(scraped_data)
                status += 1

        return price_data, status
    
    
def main(verticals=None):
    """Refresh the Sephora sale tables for every vertical.

    Args:
        verticals: optional iterable of vertical names; defaults to the full
            list of verticals handled by this notebook.

    Returns:
        dict mapping each vertical to the list of price rows written for it.
    """
    if verticals is None:
        verticals = ['face_base', 'eye', 'lip_color', 'moisturizers', 'cheek', 'treatments', 'masks', 'eye_care', 'body_care', 'mens', 'fragrance_men', 'fragrance_women', 'wellness', 'cleansers']

    sale = UpdateProductSale()
    # The constructor already opened a connection, but each vertical below
    # opens its own; release the first one so __conn__() does not overwrite
    # (and leak) it.
    sale.__close__()

    price_data_dict = {}
    for vertical in tqdm(verticals):
        sale.__conn__()
        try:
            product_codes = sale.get_data(vertical)
            sale.insert_data_new(vertical)

            price_datas = []
            for product_code in tqdm(product_codes):
                price_data, _status = sale.update_data(product_code, vertical)
                price_datas += price_data

            price_data_dict[vertical] = price_datas
        finally:
            # Always release the per-vertical connection, even on failure.
            sale.__close__()
        print(f'{vertical} product status update 완료!')

    return price_data_dict
                    

In [None]:
# Interactive version of the sale update: same steps as the function defined in
# the previous cell, but the intermediate results stay available as globals.
sale = UpdateProductSale()
# The constructor already opened a connection; every vertical below opens its
# own, so release the first one instead of letting __conn__() overwrite it.
sale.__close__()
verticals = ['face_base', 'eye', 'lip_color', 'moisturizers', 'cheek', 'treatments', 'masks', 'eye_care', 'body_care', 'mens', 'fragrance_men', 'fragrance_women', 'wellness', 'cleansers']
price_data_dict = {}
for vertical in tqdm(verticals):
    sale.__conn__()
    try:
        product_codes = sale.get_data(vertical)
        sale.insert_data_new(vertical)

        status_info, price_datas = [], []
        for product_code in tqdm(product_codes):
            price_data, status = sale.update_data(product_code, vertical)

            status_info.append([product_code, status])
            price_datas += price_data

        price_data_dict[vertical] = price_datas
    finally:
        # Always release the per-vertical connection, even when a step fails.
        sale.__close__()
    print(f'{vertical} product status update 완료!')

In [None]:
# df_test = db_glamai.get_tbl('sephora_cleansers_data_sale')
# df_status_test = db_glamai.get_tbl('sephora_cleansers_data_status')

# db_glamai.engine_upload(upload_df=df_test, table_name='sephora_test_data_sale', if_exists_option='replace')
# db_glamai.engine_upload(upload_df=df_status_test, table_name='sephora_test_data_status', if_exists_option='replace')

---
### Product update: sephora_vertical_data

In [None]:
def get_price(raw_price):
    """Return the leading price token of ``raw_price``.

    If the text contains a parenthesised suffix (e.g. ``"25.00 ($45.00 value)"``)
    everything before the first ``(`` is returned, with surrounding whitespace
    removed so both branches yield a clean token.  Otherwise the text up to the
    first space is returned.
    """
    if '(' in raw_price:
        return raw_price[:raw_price.find('(')].strip()
    return raw_price.split(' ')[0]


def get_sku_info(sku_data):
    """Flatten one SKU payload into a dict of column values.

    ``listPrice``, ``skuId`` and ``skuImages['image450']`` are read with plain
    indexing and ``targetUrl`` is concatenated to the site root, so a payload
    missing any of them raises (KeyError / TypeError) instead of returning a
    partial record.  All other fields fall back to ``None`` or ``0``.
    """
    base_url = 'https://www.sephora.com'

    # First alternate image (if any) is used as the swatch.
    alt_images = sku_data.get('alternateImages')
    swatch = base_url + alt_images[0]['image450'] if alt_images else None

    # "value - description", skipping whichever part is empty or missing.
    color_parts = (sku_data.get('variationValue'), sku_data.get('variationDesc'))
    color_label = ' - '.join(part for part in color_parts if part)

    info = {'swatch': swatch, 'color': color_label}

    # Drop the first character of the list price (presumably the currency
    # sign -- TODO confirm) before handing the text to get_price.
    raw_list_price = sku_data['listPrice'][1:]
    info['item_no'] = sku_data['skuId']
    info['price'] = get_price(raw_list_price)
    info['size'] = sku_data.get('size')
    info['url'] = base_url + sku_data.get('targetUrl')
    info['max_purchase_quantity'] = sku_data.get('maxPurchaseQuantity', 0)
    info['bigvisual'] = base_url + sku_data['skuImages']['image450']
    info['palette'] = sku_data.get('smallImage')
    info['ingredients'] = sku_data.get('ingredientDesc')

    # (output column, payload key) pairs that all default to 0 when absent.
    # NOTE(review): 'is_out_of_stock' is the only snake_case payload key here;
    # verify it against the API response.
    flag_fields = (
        ('free_shipping', 'isFreeShippingSku'),
        ('gift_wrappable', 'isGiftWrappable'),
        ('limited_edition', 'isLimitedEdition'),
        ('limited_quantity', 'isLimitedQuantity'),
        ('limited_time_offer', 'isLimitedTimeOffer'),
        ('natural_organic', 'isNaturalOrganic'),
        ('online_only', 'isOnlineOnly'),
        ('sephora_exclusive', 'isSephoraExclusive'),
        ('out_of_stock', 'is_out_of_stock'),
        ('paypal_restrict', 'isPaypalRestricted'),
    )
    for column, payload_key in flag_fields:
        info[column] = sku_data.get(payload_key, 0)

    return info


def _clean_long_description(long_description):
    """Return the long description as plain text ('' when it is None)."""
    if long_description is None:
        return ""
    text = html.unescape(long_description)
    for token in ("<b>", "</b>", "<br>", "</br>", "\r", "\n", "\t"):
        text = text.replace(token, "")
    # Remove any remaining tags.
    return re.sub('<.+?>', '', text)


def _response_has_error(response_data):
    """Tell whether the API response carries an 'errorCode' marker.

    The original code read ``response_data.text`` although the object is
    otherwise used like a dict; both shapes are supported here.
    """
    body = getattr(response_data, 'text', None)
    if body is not None:
        return 'errorCode' in body
    return 'errorCode' in response_data


def get_product_info(product_code):
    """Fetch one product from the Sephora catalog API and flatten its SKUs.

    (Original author's note on this function: "about 100" -- meaning unclear,
    TODO confirm.)

    Args:
        product_code: Sephora product id interpolated into the API URL.

    Returns:
        ``None`` when the API returned nothing or an error response; otherwise
        a list with one dict per SKU that ``get_sku_info`` could parse.  The
        current SKU (if present) comes first with ``main_sku_check == 1``;
        child SKUs (``regularChildSkus``, falling back to ``onSaleChildSkus``)
        follow with ``main_sku_check == 0``.  SKUs that fail to parse are
        printed and skipped.
    """
    # Kept as module globals so the last response / record can be inspected
    # from the notebook after a call.
    global ymal_sku
    global result
    global response_data

    url = f'https://www.sephora.com/api/catalog/products/{product_code}?preferedSku=&includeConfigurableSku=true'
    response_data = json_iterator(url)
    if response_data is None or _response_has_error(response_data):
        print(product_code, 'is None')
        return None

    # Related products ("ymalSkus") as a comma-separated list of product ids.
    ymal_sku = response_data.get('ymalSkus')
    if ymal_sku is None:
        use_with = ''
    else:
        use_with = ','.join(ymal['productId'] for ymal in ymal_sku)

    # brand
    try:
        brand = response_data.get('brand')['displayName']
    except (TypeError, KeyError) as e:
        brand = ''
        print(e, product_code, '** brand is None **')

    # A missing or null rating counts as 0; ratings are rounded to the nearest 0.5.
    rating = response_data.get('rating', 0) or 0

    # Product-level fields shared by every SKU record of this product.
    shared_fields = {
        'use_with': use_with,
        'brand': brand,
        'product_code': product_code,
        'is_sale': response_data.get('onSaleSku', 0),
        'main_sku_check': 0,  # overwritten per record below
        'like_count': response_data.get('lovesCount', 0),
        'review_count': response_data.get('reviews', 0),
        'what_it_is': response_data.get('quickLookDescription'),
        'rating': round(rating * 2) / 2,
        'product_name': response_data.get('displayName'),
        'details': _clean_long_description(response_data.get('longDescription')),
    }

    # (sku payload, main_sku_check) pairs in output order.
    candidates = []
    current_sku = response_data.get('currentSku')
    if current_sku is not None:
        candidates.append((current_sku, 1))

    # regularChildSkus
    sku_list = response_data.get('regularChildSkus')
    if sku_list is None:
        sku_list = response_data.get('onSaleChildSkus') or []
    candidates.extend((sku, 0) for sku in sku_list)

    product_list = []
    for sku, main_sku_check in candidates:
        try:
            result = get_sku_info(sku)
        except Exception as e:
            # A malformed SKU must not abort the whole product.
            print(e, product_code)
            continue
        result.update(shared_fields)
        result['main_sku_check'] = main_sku_check
        product_list.append(result)

    return product_list

In [None]:
# Load the product list for one vertical through the project's VerticalData helper.
from sephora_product.vertical_data import VerticalData
vd = VerticalData()
vertical = 'eye'
# NOTE(review): `remain_product` is not used by any later cell visible in this notebook.
remain_product = vd.get_vertical_product(vertical)
# `products` is iterated by the next cell; each item is indexed with ['product_code'].
products = vd.get_product_list(vertical)

In [None]:
# Collect the flattened SKU records for the first 200 products of the vertical.
error = {}
error_, _error, product_infos = [], [], []
for product in tqdm(products[:200]):
    product_code = product['product_code']
    product_info = get_product_info(product_code)
    if product_info is None:
        # get_product_info returns None when the API gave no usable response;
        # `product_infos += None` would raise TypeError, so skip the product.
        continue
    product_infos += product_info
    
    
    # url = f'https://www.sephora.com/api/catalog/products/{product_code}?preferedSku=&includeConfigurableSku=true'
    # response_data = json_iterator(url)
    
    # status = 1
    # try:        
    #     ymalSkus_ = response_data.get('ymalSkus')
    # except KeyError:
    #     status -= 1
    #     error_.append(url)
    # try:
    #     _ymalSkus = response_data['ymalSkus']
    # except KeyError:
    #     status -= 1
    #     _error.append(url)
    # if status == 1:
    #     product_infos.append([ymalSkus_, _ymalSkus])
    # error[product_code] = status
        

In [None]:
# Build the frame from the records collected by the product loop so this cell
# also runs on a fresh kernel (the assignment was commented out, leaving
# `info_df` undefined).
info_df = pd.DataFrame(product_infos)
info_df.info()

In [None]:
# Display the frame without duplicate rows; the result is not assigned, so
# `info_df` itself is left unchanged.
info_df.drop_duplicates()

---
### Sephora product status 

In [None]:
from sephora_update.status import *

In [None]:
# verticals = ['face_base', 'eye', 'lip_color', 'moisturizers', 'cheek', 'treatments', 'masks', 'eye_care', 'body_care', 'mens', 'fragrance_men', 'fragrance_women', 'wellness', 'cleansers']
# for vertical in tqdm(verticals):
#     table_name = f'{vertical}_product_info'
#     info_df = db_glamai.get_tbl(table_name, ['product_code', 'item_no', 'url', 'price', 'regist_date'])
#     info_df_dedup = info_df.drop_duplicates(subset=['product_code', 'item_no'], keep='first')
#     info_status = []
#     for info in tqdm(info_df_dedup.values):
#         product_code = info[0]
#         item_no = info[1]
#         url = info[2]
#         price_org = info[3]
#         regist_date = info[4]
#         price, is_use = get_status(url, item_no)
#         update_date = datetime.today()
#         if price is None:
#             price = price_org
#         info_status.append([product_code, item_no, url, price, is_use, regist_date, update_date])
        
#     upload_table = f'sephora_{vertical}_data_status'
#     columns = ['product_code', 'item_no', 'url', 'price', 'is_use', 'regist_date', 'update_date']
#     upload_df = pd.DataFrame(info_status, columns=columns)
    
#     db_glamai.create_table(upload_df=upload_df, table_name=upload_table)

In [None]:
### Test

# vertical = 'test'
# table_name = f'{vertical}_product_info'
# info_df = db_glamai.get_tbl(table_name, ['product_code', 'item_no', 'url', 'price', 'regist_date'])
# info_df_dedup = info_df.drop_duplicates(subset=['product_code', 'item_no'], keep='first')
# info_status = []
# for info in tqdm(info_df_dedup.values):
#     product_code = info[0]
#     item_no = info[1]
#     url = info[2]
#     price_org = info[3]
#     regist_date = info[4]
#     price, is_use = get_status(url, item_no)
#     update_date = datetime.today()
#     if price is None:
#         price = price_org
#     info_status.append([product_code, item_no, url, price, is_use, regist_date, update_date])
    
# upload_table = f'sephora_{vertical}_data_status'
# columns = ['product_code', 'item_no', 'url', 'price', 'is_use', 'regist_date', 'update_date']
# upload_df = pd.DataFrame(info_status, columns=columns)
# db_glamai.create_table(upload_df, upload_table)

In [None]:
### Test

# test_df = db_glamai.get_tbl('face_base_product_info')
# test_product_info = test_df.sample(50)
# db_glamai.engine_upload(test_product_info, 'test_product_info', if_exists_option="replace")

---
### Search keywords

In [None]:
from sephora_keyword.search_keyword import update_search_keywords, db_distinction

In [None]:
# Run the project's search-keyword update and keep its return value in `total_df`,
# then call db_distinction(). What each helper writes is not visible from this
# notebook -- see sephora_keyword.search_keyword.
from sephora_keyword.search_keyword import update_search_keywords, db_distinction
total_df = update_search_keywords()
db_distinction()

In [None]:
# Load the `glamai_search_keywords` table and print its column summary.
# NOTE(review): the variable name is misspelled ("keyowrds").
search_keyowrds_df = db_glamai.get_tbl('glamai_search_keywords')
search_keyowrds_df.info()

In [None]:
# Same check as the previous cell: reload `glamai_search_keywords` and print its summary.
search_keyowrds_df = db_glamai.get_tbl('glamai_search_keywords')
search_keyowrds_df.info()

In [None]:
# Third copy of the same check: reload `glamai_search_keywords` and print its summary.
search_keyowrds_df = db_glamai.get_tbl('glamai_search_keywords')
search_keyowrds_df.info()

---
### Affiliate Price

---
#### Amazon 

In [None]:
# def get_data_amazon(url):

#     # wd = get_url(url)
#     wd = get_url(url)
#     soup = BeautifulSoup(wd.page_source, 'lxml')
#     if soup is None:
#         print("soup is None")
#         wd.quit()
#         return None
#     else:    
#         # Check page status
#         if soup.find('div', {'id': 'g'}) is None:
#             page_status = 1
#         else:
#             page_status = 0
#             avaliability_txt, price = None, None
#             price_normal, price_sale, is_sale, is_use = 0, 0, 0, 0
        
#         if page_status == 1:
#             # Check currently unavailable
#             if soup.find('div', {'id': 'availability'}) is None:
#                 avaliability_txt = None
#             else:
#                 avaliability = soup.find('div', {'id': 'availability'})
#                 avaliability_txt = avaliability.find('span').text.strip()

#             # Check price
#             if soup.find('div', 'a-section a-spacing-none aok-align-center') is not None:
#                 price_area = soup.find('div', 'a-section a-spacing-none aok-align-center')
#                 if price_area.find('span', 'a-offscreen') is None:
#                     price = None
#                     price_normal, price_sale, is_sale, is_use = 0, 0, 0, 0
#                 else:
#                     is_use = 1
#                     price = price_area.find('span', 'a-offscreen').text
#                     price_sale = round(float(price[1:]), 2)
#                     if soup.find('span', 'a-size-small a-color-secondary aok-align-center basisPrice') is None:
#                         price_normal = price_sale
#                         price_sale = 0
#                         is_sale = 0
#                     else:
#                         price_area = soup.find('span', 'a-size-small a-color-secondary aok-align-center basisPrice')
#                         if price_area.find('span', 'a-offscreen') is None:
#                             price_normal = price_sale
#                             price_sale = 0
#                             is_sale = 0
#                         else:
#                             price = price_area.find('span', 'a-offscreen').text
#                             price_normal = round(float(price[1:]), 2)   
#                             if price_normal > price_sale:
#                                 is_sale = 1
#                             else:
#                                 is_sale = -1
                                
#             elif soup.find('div', {'id': 'corePrice_desktop'}) is not None:
#                 price = None
#                 price_area = soup.find('div', {'id': 'corePrice_desktop'})
                
#                 is_sale, is_use = 0, 0
#                 try:
#                     price = price_area.find('span', 'a-price a-text-price a-size-base').find('span', 'a-offscreen').text
#                     price_normal = round(float(price[1:]), 2)
#                     is_use = 1
#                 except:
#                     price_normal = 0
                    
#                 try:
#                     price = price_area.find('span', 'a-price a-text-price a-size-medium apexPriceToPay').find_all('span', 'a-offscreen')[0].text
#                     price_sale = round(float(price[1:]), 2)
#                 except:
#                     price_sale = 0
                    
#                 if price_normal > price_sale:
#                     is_sale = 1
#                 else:
#                     price_normal = price_sale
#                     price_sale = 0
    
#             else:
#                 price_normal, price_sale, is_sale, is_use = 0, 0, 0, 0
                

#         wd.quit()
#         return [url, page_status, avaliability_txt, price_normal, price_sale, is_sale, is_use, price]

# def get_data():
#     df_price = db_glamai.get_tbl('affiliate_price')
#     df_amazon = df_price[df_price.affiliate_type=='amazon']
    
#     return df_amazon
    
# def _crawling(value):
#     product_code = value[0]
#     item_no = value[1]
#     affiliate_type = 'amazon'
#     affiliate_url = value[3]
#     affiliate_image = value[4]
#     regist_date = value[9]
    
#     data = get_data_amazon(affiliate_url)
#     if data is None:
#         pass
#     else:
#         data = [product_code, item_no, affiliate_type, affiliate_image, regist_date] + data
        
#     return data

# def upload_data(data):
#     columns = ['product_code', 'item_no', 'affiliate_type', 'affiliate_image', 'regist_date', 'affiliate_url', 'page_status', 'avaliability_txt', 'price', 'sale_price', 'is_sale', 'is_use', 'price_']
#     crawling_df = pd.DataFrame(data, columns=columns)

#     upload_columns = ['product_code', 'item_no', 'affiliate_type', 'affiliate_url', 'affiliate_image',  'price', 'sale_price', 'is_sale', 'is_use', 'regist_date']
#     upload_df = crawling_df.loc[:, upload_columns]
#     upload_df.loc[:, 'update_date'] = datetime.today()
#     # db_jangho.create_table(upload_df=upload_df, table_name='affiliate_price_update_amazon')
    
#     return crawling_df, upload_df

# def main():
#     df_amazon = get_data()
#     datas, error = [], []
#     for value in tqdm(df_amazon.values):
#         data = _crawling(value)
#         if data is None:
#             affiliate_url = value[3]
#             error.append(affiliate_url)
#         else:
#             datas.append(data)
#     crawling_df, upload_df = upload_data(datas)

In [None]:
# df_amazon = get_data()
# datas, error = [], []
# for value in tqdm(df_amazon.values):
#     data = _crawling(value)
#     if data is None:
#         affiliate_url = value[3]
#         error.append(affiliate_url)
#     else:
#         datas.append(data)
# crawling_df, upload_df = upload_data(datas)

In [None]:
# df.groupby('avaliability_txt').count()
# urls = df[(df.avaliability_txt != 'Currently unavailable.') & (df.price_.isnull())].affiliate_url.unique().tolist()

# datas = []
# for url in tqdm(urls):
#     wd = get_url(url)
#     soup = BeautifulSoup(wd.page_source, 'lxml')
#     if soup is None:
#         print("soup is None", url)
#         wd.quit()
#     else:    
#         price = None
        
#         if soup.find('div', {'id': 'corePrice_desktop'}) is None:
#             price_normal = 0
#         else:
#             price_area = soup.find('div', {'id': 'corePrice_desktop'})
#             try:
#                 price = price_area.find('span', 'a-price a-text-price a-size-base').find('span', 'a-offscreen').text
#                 price_normal = round(float(price[1:]), 2)
#             except:
#                 price_normal = 0
            
#             try:
#                 price = price_area.find('span', 'a-price a-text-price a-size-medium apexPriceToPay').find_all('span', 'a-offscreen')[0].text
#                 price_sale = round(float(price[1:]), 2)
#             except:
#                 price_sale = 0
                
#         datas.append([url, price_normal, price_sale, price]) 

# _df = pd.DataFrame(datas, columns=['url', 'price', 'sale', 'price_'])

In [2]:
# Crawl Amazon affiliate prices with the project helpers.
# NOTE: get_data, _crawling and _upload are redefined for Ulta in a later cell
# of this notebook, so re-running this import is required before re-running
# this cell after the Ulta section.
from affiliate.amazon import get_data, _crawling, _upload
df_amazon = get_data()
# `datas` collects successful rows; `error` collects the affiliate URL
# (value[3]) of every row for which _crawling returned None.
datas, error = [], []
for value in tqdm(df_amazon.values):
    data = _crawling(value)
    if data is None:
        affiliate_url = value[3]
        error.append(affiliate_url)
    else:
        datas.append(data)
crawling_df, upload_df = _upload(datas)
# db_jangho.create_table(upload_df=upload_df, table_name='affiliate_price_update_amazon')



`affiliate_price` Import Time: 1.0sec


[WDM] - Downloading: 100%|██████████| 8.15M/8.15M [00:00<00:00, 12.5MB/s]
 61%|██████▏   | 2809/4584 [2:22:29<1:46:36,  3.60s/it] 



Error: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: headless chrome=105.0.5195.125)
Stacktrace:
0   chromedriver                        0x00000001045ef788 chromedriver + 4515720
1   chromedriver                        0x00000001045739d3 chromedriver + 4008403
2   chromedriver                        0x000000010420612a chromedriver + 413994
3   chromedriver                        0x00000001041ffd8e chromedriver + 388494
4   chromedriver                        0x00000001041f34ad chromedriver + 337069
5   chromedriver                        0x00000001041f4370 chromedriver + 340848
6   chromedriver                        0x00000001041f3718 chromedriver + 337688
7   chromedriver                        0x00000001041f2bd4 chromedriver + 334804
8   chromedriver                        0x00000001041f1a87 chromedriver + 330375
9   chromedriver                        0x00000001041f1e02 chromedriver + 331266
10  chromedriver                        0x0000000104207d2e chro

100%|██████████| 4584/4584 [3:38:41<00:00,  2.86s/it]    


ValueError: Shape of passed values is (13, 1), indices imply (13, 13)

---
#### Ulta

In [None]:
# warnings.filterwarnings(action="ignore")
# pd.set_option('display.max_columns', 100)
# pd.set_option('display.max_rows', 3000)
# pd.set_option('display.width', 2000)

In [None]:
from selenium.webdriver.common.by import By

def get_data_ulta(url):
    """Scrape the regular and sale price from an Ulta product page.

    The page shows a "big" price and, when the item is discounted, a "small"
    price next to it: if the small price exists it is the regular price and the
    big price is the sale price; otherwise the big price is the regular price.

    Returns:
        ``(status, normal_price, sale_price, is_sale, is_use, big_price_txt,
        small_price_txt)`` where ``status`` is
        1 (a price was read), 0 (no price element found),
        -1 (unexpected error or unparseable price), -2 (no driver).
        Prices are floats rounded to 2 decimals; the ``*_txt`` values are the
        raw element texts (``None`` when not read).
    """
    big_price_xpath = '//*[@id="root"]/div/div/main/div/div/div[3]/div/div[3]/span[1]'
    small_price_xpath = '//*[@id="root"]/div/div/main/div/div/div[3]/div/div[3]/span[3]'

    driver = get_url(url)
    time.sleep(5)

    big_price_txt, small_price_txt = None, None
    if driver is None:
        # Nothing to scrape and nothing to quit (the original code went on to
        # call driver.quit() on None here).
        return -2, 0.0, 0.0, 0, 0, big_price_txt, small_price_txt

    try:
        is_sale, is_use = 1, 1
        status = 1
        big_price = None  # stays None until the big price element was read
        try:
            big_price_txt = driver.find_element(By.XPATH, big_price_xpath).text
            big_price = ''.join(x for x in big_price_txt if x not in "Price\n$").replace('Sal', '').strip()

            small_price_txt = driver.find_element(By.XPATH, small_price_xpath).text
            small_price = ''.join(x for x in small_price_txt if x not in "Original Price\n$")
            small_price = re.sub(r'[\(\)vu]+', '', small_price)

            # check sale
            normal_price = round(float(small_price), 2)
            sale_price = round(float(big_price), 2)
            if normal_price <= sale_price:
                sale_price = 0
                is_sale = 0

        except NoSuchElementException:
            sale_price, is_sale = 0, 0
            if big_price is None:
                # Not even the big price exists: the page has no usable price.
                normal_price, is_use = 0, 0
                status = 0
            else:
                # Only the big price exists, so it is the regular price.
                try:
                    normal_price = float(big_price)
                except ValueError as e:
                    print(e)
                    normal_price, is_use = 0, 0
                    status = -1

        except Exception as e:
            print(e)
            normal_price, sale_price = 0, 0
            is_sale, is_use = 0, 0
            status = -1

        normal_price = round(float(normal_price), 2)
        sale_price = round(float(sale_price), 2)
        return status, normal_price, sale_price, is_sale, is_use, big_price_txt, small_price_txt
    finally:
        # Always release the browser, whatever happened above.
        driver.quit()

def get_data():
    """Return the rows of the `affiliate_price` table whose affiliate_type is 'ulta'."""
    affiliate_prices = db_glamai.get_tbl('affiliate_price')
    is_ulta = affiliate_prices.affiliate_type == 'ulta'
    return affiliate_prices[is_ulta]

def _crawling(value):
    """Scrape one `affiliate_price` row and return it with the fresh Ulta prices.

    ``value`` is indexed by position: 0 product_code, 1 item_no,
    3 affiliate_url, 4 affiliate_image, 9 regist_date.  The returned list has
    the 13 fields, in the order of the column list used by ``_upload``.
    """
    product_code, item_no = value[0], value[1]
    affiliate_url, affiliate_image = value[3], value[4]
    regist_date = value[9]

    scraped = get_data_ulta(affiliate_url)
    status, price, sale_price, is_sale, is_use, big_price_txt, small_price_txt = scraped

    return [
        product_code, item_no, 'ulta', affiliate_url, affiliate_image,
        price, sale_price, is_sale, is_use, regist_date,
        status, big_price_txt, small_price_txt,
    ]

def _upload(data):
    """Build the crawl result frames and write the upload frame to the database.

    Args:
        data: list of 13-field rows as produced by ``_crawling``.

    Returns:
        ``(crawling_df, upload_df)``: the full crawl frame, and the subset of
        columns written to ``affiliate_price_update_ulta`` plus an
        ``update_date`` column holding the current timestamp.
    """
    columns = ['product_code', 'item_no', 'affiliate_type', 'affiliate_url', 'affiliate_image', 'price', 'sale_price', 'is_sale', 'is_use', 'regist_date', 'status', 'big_price', 'small_price']
    crawling_df = pd.DataFrame(data, columns=columns)

    upload_columns = ['product_code', 'item_no', 'affiliate_type', 'affiliate_url', 'affiliate_image',  'price', 'sale_price', 'is_sale', 'is_use', 'regist_date']
    # assign() builds a new frame, avoiding assignment into a slice of
    # crawling_df (SettingWithCopyWarning).
    upload_df = crawling_df.loc[:, upload_columns].assign(update_date=datetime.today())
    db_jangho.create_table(upload_df=upload_df, table_name='affiliate_price_update_ulta')

    return crawling_df, upload_df

def main():
    """Crawl every Ulta affiliate row and upload the refreshed prices.

    NOTE: this definition shadows the sale-update ``main`` defined earlier in
    the notebook once this cell has been executed.

    Returns:
        ``(crawling_df, upload_df)`` as produced by ``_upload`` (previously the
        frames were built and then discarded).
    """
    df_ulta = get_data()
    datas, error = [], []
    for value in tqdm(df_ulta.values):
        data = _crawling(value)
        if data is None:
            # Kept for parity with the Amazon crawler, whose _crawling can
            # return None; the failing URL is value[3].
            affiliate_url = value[3]
            error.append(affiliate_url)
        else:
            datas.append(data)
    crawling_df, upload_df = _upload(datas)
    return crawling_df, upload_df
    
# if __name__ == '__main__':
#     main()

In [None]:
# Interactive run of the Ulta crawl; the resulting frames stay available as
# globals for the inspection cells below.
df_ulta = get_data()
# `datas` collects crawled rows, `error` the affiliate URL (value[3]) of rows
# for which _crawling returned None. The Ulta `_crawling` defined in this
# notebook always returns a list, so `error` stays empty unless a different
# `_crawling` is in scope.
datas, error = [], []
for value in tqdm(df_ulta.values):
    data = _crawling(value)
    if data is None:
        affiliate_url = value[3]
        error.append(affiliate_url)
    else:
        datas.append(data)
crawling_df, upload_df = _upload(datas)

In [None]:
# Work on a copy so the inspection below cannot modify crawling_df.
ulta_df = crawling_df.copy()

# Non-null value counts of every column, grouped by `is_use`.
ulta_df.groupby('is_use').count()

In [None]:
# Non-null value counts of every column, grouped by `is_sale`.
ulta_df.groupby('is_sale').count()

In [None]:
# Non-null value counts of every column, grouped by scrape `status`.
ulta_df.groupby('status').count()

In [None]:
# Non-null value counts of every column, grouped by `price`.
ulta_df.groupby('price').count()

In [None]:
# Non-null value counts of every column, grouped by `sale_price`.
ulta_df.groupby('sale_price').count()

In [None]:
# Distinct raw price texts scraped for each of the two price elements.
# A cell only displays its last expression, so both lists are returned in one
# dict instead of two separate statements (the first one was never shown).
{
    column: ulta_df.groupby(column).count().index.tolist()
    for column in ('big_price', 'small_price')
}

In [None]:
# Display the crawl result frame (rendered in full; use .head() for a preview).
ulta_df

In [None]:
# Rebuild the upload frame from the inspected copy: keep the upload columns and
# stamp every row with the current time. assign() returns a new frame, so no
# value is written into a slice of ulta_df (avoids SettingWithCopyWarning).
upload_columns = ['product_code', 'item_no', 'affiliate_type', 'affiliate_url', 'affiliate_image',  'price', 'sale_price', 'is_sale', 'is_use', 'regist_date']
upload_df = ulta_df.loc[:, upload_columns].assign(update_date=datetime.today())

In [None]:
# Sort the upload frame and write it to `affiliate_price_update_ulta`.
# NOTE(review): `_upload` (called in the crawl cell) already calls create_table
# for the same table name; whether this call replaces or appends is not visible
# here -- confirm before re-running.
upload_df = upload_df.sort_values(by=['product_code', 'item_no', 'regist_date', 'update_date'], ignore_index=True)
db_jangho.create_table(upload_df=upload_df, table_name='affiliate_price_update_ulta')

---
### update sale test



In [None]:
# Run the packaged sale update against the 'test' vertical and keep its return
# value in `data` for inspection in the next cell.
from sephora_update.sales import update_sephora_sale
vertical = 'test'
data = update_sephora_sale(vertical)

In [None]:
# Display the value returned by update_sephora_sale as a DataFrame.
pd.DataFrame(data)

In [None]:
## Test table ## 

# sale_test = db_glamai.get_tbl('sephora_cleansers_data_sale')
# status_test = db_glamai.get_tbl('sephora_cleansers_data_status')

# db_glamai.engine_upload(upload_df=sale_test, table_name='sephora_test_data_sale', if_exists_option='replace')
# db_glamai.engine_upload(upload_df=status_test, table_name='sephora_test_data_status', if_exists_option='replace')