In [1]:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import concurrent.futures
from concurrent.futures import as_completed
from tqdm import tqdm
import pandas as pd
import numpy as np





def get_last_page(sec_name, sec_url):

    section_url = f'https://www.lazada.co.th{sec_url}?page=1'
    
    print(f'Processing {sec_name} Section...')
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(options=options)
    driver.get(section_url)

    print(f'Searching for {sec_name} Section Last Page...')

    element = driver.find_element(By.CLASS_NAME, 'e5J1n')
    li_elements = element.find_elements(By.TAG_NAME, 'li')
    new_li_elements = [li.text for li in li_elements]
    
    driver.quit()

    return int(new_li_elements[-2])  # -2 because the last element is "Next Page"



def scrape_page(page_number, sec_name, section_url):

    
    try:
        url = f"{section_url}/?page={page_number}"

        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        driver = webdriver.Chrome(options=options)
        driver.get(url)

        product_divs = driver.find_elements_by_css_selector('div[data-qa-locator="product-item"]')

        products = []
        for div in product_divs:
            # ... extract product details ...
            product_id = div.get_attribute('data-item-id')  # Extracting product ID
            product_section = sec_name
            product_name = div.find_element_by_css_selector('a[title]').get_attribute('title')
            product_price = div.find_element_by_css_selector('span.ooOxS').text
            
            try : 
                product_sold = div.find_element_by_css_selector('span._1cEkb span').text
            except : 
                product_sold = np.nan

            try : 
                product_review = div.find_element_by_css_selector('span.qzqFw').text
            except:
                product_review = np.nan
        
            try :
                product_location = div.find_element_by_css_selector('span.oa6ri').get_attribute('title')
            except:
                prodcut_location = np.nan
                
            product_url = div.find_element_by_css_selector('a[href]').get_attribute('href')


            products.append((product_id,
                             product_section,
                             product_name,
                             product_price,
                             product_sold,
                             product_review,
                             product_location,
                             product_url,
                             page_number))

    except Exception as exc:
        print(f"scrape_page ==> An error occurred on page {page_number}: {exc}")

    finally:
        driver.quit()
        if products != []:
            # print(f"{len(products)} products were found in {sec_name}")
            pass
        else:
            print(f"Error occured in page {page_number} of {sec_name}")

    return products





def scrape_section(sec_name, sec_url, last_pages):
        
    print('='*10 + f'START of {sec_name} Section' + '='*10)
    print(f'{sec_name} Section total pages : {last_pages[sec_name]}')
    print(f'Web scrapping {sec_name} Section...')
    
    section_url = f'https://www.lazada.co.th{sec_url}'

    with concurrent.futures.ThreadPoolExecutor() as executor:
        pages = list(range(1, last_pages[sec_name] + 1))
        futures = {executor.submit(scrape_page, page, sec_name, section_url) for page in pages}
        
        results = []
        for future in tqdm(as_completed(futures), total=len(pages)):
            results.append(future.result())

    flattened_results = [product for sublist in results for product in sublist]

    df = pd.DataFrame(flattened_results, columns=['Id', 'Section', 'Name', 'Price', 'Total Sold', 'Total Reviews', 'Shop Location', 'URL', 'Catalog Pages'])

    print(f'{len(df)} rows of products were created')
    print('='*10 + f'END of {sec_name} Section' + '='*10)
    print("\n\n\n")
    
    return df
   

    
    
def save_dataframe(dataframe, name, time):

    # Concatenate the timestamp to the file name
    file_name = f"{name}_{time}.csv"

    # Save the DataFrame to a CSV file with the updated file name
    dataframe.to_csv(file_name, index=False)
                 



health_section = {
    "Acne Care": "/shop-acne-care/",
    "Beauty Supplements Value Sets": "/shop-beauty-supplements-gifts/",
    "Breast Enlargement": "/shop-breast-enlargement-supplements/?spm=a2o4m.searchlistcategory.funnel_filter.d1.1e4f4a5cyLrE1m",
    "Well Being Gifts & Value Sets": "/shop-well-being-gifts/",
    "Bone & Joint Support": "/shop-bones-joints/",
    "Skin Nourishment": "/shop-skin-supplements/",
    "Multivitamins": "/shop-multivitamin-supplements/",
    "Digestive Care": "/shop-digestion-and-absorption/",
    "Herbs & Traditional Medicine": "/shop-herbs-traditional-medicine/",
    "Whitening": "/shop-whitening-supplements/",
    "Nutritional Foods & Drinks": "/shop-nutritional-foods-drinks/",
    "Brain & Memory": "/shop-brain-memory/",
    "Immunity": "/shop-immune-system/",
    "Protein": "/shop-protein/",
    "Slimming Beverages": "/shop-slimming-beverages/",
    "Heart & Blood Pressure": "/shop-heart-cholesterol/",
    "Men's Health": "/shop-mens-health/",
    "Stress, Sleep, and Anxiety": "/shop-stress-anxiety-and-depression/",
    "Hair & Nail": "/shop-hair-nail-supplements/",
    "Sexual Health Vitamins": "/shop-sexual-health/",
    "Appetite Suppressant": "/shop-appetite-suppressants/",
    "Women's Health": "/shop-women-and-menopause/",
    "Meal Replacement": "/shop-meal-replacement/",
    "Weight Management Value Sets": "/shop-weight-management-gifts/",
    "Fat Blockers & Burners": "/shop-fat-burners/",
    "Pre-Workout": "/shop-pre-workout/",
    "Mass Gainer": "/shop-weight-gain-supplements/",
    "Detoxification": "/shop-liver-detox/",
    "Pregnancy Care": "/shop-pregnancy-care/",
    "Food Supplement": "/shop-health-food-supplements-weight-management/"
}


def create_lastPagesDict():
    
    ''' 
    Create dictionary contain number of last page 
    '''

    last_pages = {}

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {executor.submit(get_last_page, sec_name, sec_url): sec_name for sec_name, sec_url in health_section.items()}

    for future in concurrent.futures.as_completed(futures):
        sec_name = futures[future]
        try:
            data = future.result()
            last_pages[sec_name] = data
        except Exception as exc:
            print(f'Error occured while finding last pages in {sec_name}\n generated an exception: {exc}')
        else:
            print(f'{sec_name} Page Number is: {data}')

    print('='*15 + f'Finding Last Pages END' + '='*15)
    print("\n\n\n")

    return last_pages
        
        
   

In [2]:
# Load pre-build last_pages dictionaty
import pandas as pd
d = pd.read_csv('last_pages_dict_20230808_125116.csv', )
last_pages = dict(d.values)
last_pages

{'Food Supplement': 102}

In [3]:
%%time

from datetime import datetime

# Generate the current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")




dfs = []

for sec_name, sec_url in health_section.items():
    
    try :
        df = scrape_section(sec_name, sec_url, last_pages)
    
        # Save output dataframe as csv file
        save_dataframe(df, sec_name, timestamp)

        dfs.append(df)
        print(f'Now total dataframe are {len(dfs)} sets')
        
    except Exception as exc:
        print(f'Error : {exc}')
        

# df_concatenated = pd.concat(dfs).reset_index(drop=True)

# # Save output final dataframe as csv file
# save_dataframe(df_concatenated, "health_and_wellness", timestamp)
# df_concatenated

Error : 'Acne Care'
Error : 'Beauty Supplements Value Sets'
Error : 'Breast Enlargement'
Error : 'Well Being Gifts & Value Sets'
Error : 'Bone & Joint Support'
Error : 'Skin Nourishment'
Error : 'Multivitamins'
Error : 'Digestive Care'
Error : 'Herbs & Traditional Medicine'
Error : 'Whitening'
Error : 'Nutritional Foods & Drinks'
Error : 'Brain & Memory'
Error : 'Immunity'
Error : 'Protein'
Error : 'Slimming Beverages'
Error : 'Heart & Blood Pressure'
Error : "Men's Health"
Error : 'Stress, Sleep, and Anxiety'
Error : 'Hair & Nail'
Error : 'Sexual Health Vitamins'
Error : 'Appetite Suppressant'
Error : "Women's Health"
Error : 'Meal Replacement'
Error : 'Weight Management Value Sets'
Error : 'Fat Blockers & Burners'
Error : 'Pre-Workout'
Error : 'Mass Gainer'
Error : 'Detoxification'
Error : 'Pregnancy Care'
Food Supplement Section total pages : 102
Web scrapping Food Supplement Section...


100%|████████████████████████████████████████████████████████████████████████████████| 102/102 [05:03<00:00,  2.98s/it]

1020 rows of products were created




Now total dataframe are 1 sets
CPU times: total: 28.4 s
Wall time: 5min 3s





In [None]:
# Function to make a link clickable
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val, val)

# Apply the function to the 'Product URL' column
dfs_styled = df_concatenated.style.format({'Product URL': make_clickable})
dfs_styled

In [None]:
"""
Create Dataframe
"""

from datetime import datetime

# Assuming df_concatenated contains your DataFrame

# Generate the current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Concatenate the timestamp to the file name
file_name = f"health_and_wellness_{timestamp}.csv"

# Save the DataFrame to a CSV file with the updated file name
df_concatenated.to_csv(file_name, index=False)


In [2]:
# Generate timestamp

from datetime import datetime

current_timestamp = datetime.now()

# Custom format for the timestamp
custom_format = "%Y-%m-%d_%H-%M-%S"

# Print the timestamp with the custom format
print(current_timestamp.strftime(custom_format))

2023-08-03_16-42-58


In [3]:
last_pages = {
 'Food Supplement': 102,
}

In [4]:
from datetime import datetime
import pandas as pd


# Generate the current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Concatenate the timestamp to the file name
file_name = f"last_pages_dict_{timestamp}.csv"

# Save the DataFrame to a CSV file with the updated file name
d = pd.DataFrame(last_pages.items())
d.to_csv(file_name, index=False)