# 1.1 Imports

In [1]:
# import libraries
import csv
import os
import re
import time
import urllib
import urllib.request
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
# ignore warnings
# NOTE: filterwarnings('ignore') installs a catch-all filter, so every warning
# category (including deprecation warnings from the libraries used below) is
# silenced for the rest of the kernel session.
import warnings
warnings.filterwarnings('ignore')

In [58]:
# create function
# open web browser

def open_web_browser(driver_path=None, headless=True):
    """Start a Chrome WebDriver session in incognito mode.

    Parameters
    ----------
    driver_path : str, optional
        Path to the chromedriver executable. When omitted, the
        ``CHROMEDRIVER_PATH`` environment variable is used if it is set,
        otherwise the path this notebook was originally written with.
    headless : bool, optional
        Run Chrome without a visible window (default ``True``, which is what
        the notebook always did).

    Returns
    -------
    selenium.webdriver.Chrome
        The started driver. The caller is responsible for shutting it down.
    """
    if driver_path is None:
        # keep the original location as the last-resort default so existing
        # calls `open_web_browser()` behave exactly as before
        driver_path = os.environ.get(
            'CHROMEDRIVER_PATH',
            '/Users/abrahamleung/Documents/chromedriver',
        )

    chrome_options = Options()

    # incognito window
    chrome_options.add_argument("--incognito")

    # set the browser headless
    if headless:
        chrome_options.add_argument("--headless")

    # open web browser
    driver = webdriver.Chrome(driver_path, options=chrome_options)
    return driver

In [4]:
# list of categories
# category slugs, in the order they are scraped
categories = ['activewear', 'jackets', 'sweatshirts-hoodies']

# 1.2 Extracting Links

In [5]:
# create function
# get product links

def product_links(driver, category, max_links=15):
    """Save the product page links of one category to ``data/{category}_links.txt``.

    The category listing page is loaded in ``driver``, the ``href`` of every
    ``<a class="name-link">`` element is read, and the links are written one
    per line. Any previous content of the links file is replaced.

    Parameters
    ----------
    driver : selenium WebDriver
        An open browser session.
    category : str
        Category slug that is inserted into the listing URL and the file name.
    max_links : int or None, optional
        Only the first ``max_links`` product elements are used (default 15,
        the limit the notebook always applied). ``None`` keeps all of them.

    Returns
    -------
    None
    """
    # category link
    URL = f'https://www.calvinklein.com/hk/en/women-apparel-{category}/'

    # navigate webpage
    driver.get(URL)

    # may need time sleep
    time.sleep(0.5)

    # get all product elements
    product_elements = driver.find_elements_by_xpath('//a[@class="name-link"]')

    # read the links before touching the file, so a failure while scraping
    # does not leave an emptied links file behind
    hrefs = [
        product_element.get_attribute('href')
        for product_element in product_elements[:max_links]
    ]

    # the output folder may not exist on a fresh checkout
    os.makedirs('data', exist_ok=True)

    # "w" both creates a missing file and truncates an existing one, and the
    # with-block guarantees the single handle is closed
    with open(f"data/{category}_links.txt", "w") as f:
        for href in hrefs:
            # an anchor without href yields None, which cannot be written
            if href:
                f.write(href)
                f.write('\n')

In [6]:
# scrape all links

# open web browser
driver = open_web_browser()

try:
    for category in categories:

        # extract and save product links
        product_links(driver, category)
finally:
    # quit() ends the whole browser session (close() only closes the current
    # window); the finally makes sure this also happens when scraping fails
    driver.quit()

# 1.3 Extracting Product Details

In [7]:
# create function
# get product details

def product_detail(driver, URL):
    """Scrape name, price and a 300x300 product image from one product page.

    Parameters
    ----------
    driver : selenium WebDriver
        An open browser session; it is navigated to ``URL``.
    URL : str
        Product page address.

    Returns
    -------
    tuple
        ``(name, price, img_file)``. ``name`` is the text of the first
        ``<h1>``, ``price`` the text of ``<span class="price-sales">`` and
        ``img_file`` the path of the saved image under ``image/``. Each item
        is ``None`` when it could not be obtained.
    """
    # input website
    driver.get(URL)

    # may need time sleep
    time.sleep(0.5)

    # get product name
    try:
        name = driver.find_element_by_tag_name('h1').text
    except Exception:
        name = None

    # get product price
    try:
        price = driver.find_element_by_xpath('//span[@class="price-sales"]').text
    except Exception:
        price = None

    # get product image
    img_file = None
    # the image file is named after the product, so without a name there is
    # nothing sensible to save (previously this produced 'image/None.png')
    if name:
        try:
            # size
            img_width, img_height = 300, 300

            # get all images; the product image is at index 0
            images = driver.find_elements_by_xpath('//img[@class="primary-image cloudzoom"]')
            src = images[0].get_attribute('src')

            # characters such as '/' would change or break the target path;
            # ordinary names are left untouched
            safe_name = re.sub(r'[\\/:*?"<>|]+', '_', name).strip()
            os.makedirs('image', exist_ok=True)
            target = f'image/{safe_name}.png'

            # download image
            urllib.request.urlretrieve(src, target)

            # resize image (smaller size); the with-block closes the source
            # file before it is overwritten
            with Image.open(target) as ori_img:
                resize_img = ori_img.resize((img_width, img_height))
            resize_img.save(target)

            img_file = target
        except Exception as exc:
            # still best-effort, but say why the image is missing
            print(f'image not saved for {URL}: {exc!r}')
            img_file = None

    return name, price, img_file

In [59]:
# open web browser
driver = open_web_browser()

# for every category, scrape every link
# NOTE: this cell calls product_detail(driver, link) with two arguments. The
# multi-threading section further down redefines product_detail with three
# parameters, so this cell has to run before that definition is executed.

try:
    for category in categories:

        start_time = time.time()

        # load all links (the with-block closes the file again)
        with open(f'data/{category}_links.txt', 'r') as f:
            links = f.read().split()

        names = []
        prices = []
        img_files = []
        urls = []

        # scrape every link
        for link in links:
            name, price, img_file = product_detail(driver, link)

            # append data into lists
            names.append(name)
            prices.append(price)
            img_files.append(img_file)
            urls.append(link)

        # convert to dataframe
        df = pd.DataFrame({
            'name': names,
            'price': prices,
            'img_file': img_files,
            'url': urls
        })

        # remove rows with missing values and renumber the remaining rows
        df = df.dropna().reset_index(drop=True)

        # save file
        df.to_csv(f'data/{category}.csv', index=False)

        end_time = time.time()

        # time of scraping, creating dataframe and saving as csv file
        time_taken = end_time - start_time

        print(f'{category}:')
        print(f"- no of items: {len(links)} ")
        print(f"- total time taken: {round(time_taken,2)}s")

        # average time taken for each item; skipped for an empty links file,
        # which would otherwise divide by zero
        if links:
            average_time = time_taken / len(links)
            print(f"- average time for each item: {round(average_time,2)}s")
        print('\n')
finally:
    # quit() ends the whole browser session, also when a category fails
    driver.quit()

activewear:
- no of items: 15 
- total time taken: 12.39933180809021


jackets:
- no of items: 15 
- total time taken: 11.057486772537231


sweatshirts-hoodies:
- no of items: 15 
- total time taken: 11.013481140136719




# 1.4 (Unsuccessful) Scraping with Multi-threading

In [5]:
# try multi-threading - creating empty files

# every category starts from a CSV that holds only the header row
csv_columns = ['name', 'price', 'img_file', 'url']

for category in categories:

    # frame with the four output columns and no rows
    df = pd.DataFrame({column: [] for column in csv_columns})

    # save file (replaces an existing CSV of the same name)
    df.to_csv(f'data/{category}.csv', index=False)

In [9]:
# try multi-threading - scraping function

def product_detail(driver, category, URL):
    """Scrape one product page and append the result to ``data/{category}.csv``.

    This definition replaces the two-argument ``product_detail`` defined
    earlier in the notebook; instead of returning the values it writes them
    as one CSV row ``name, price, img_file, URL``.

    Parameters
    ----------
    driver : selenium WebDriver
        An open browser session; it is navigated to ``URL``.
    category : str
        Category slug that selects the CSV file to append to.
    URL : str
        Product page address.

    Returns
    -------
    None
    """
    # input website
    driver.get(URL)

    # may need time sleep
    time.sleep(0.5)

    # get product name
    try:
        name = driver.find_element_by_tag_name('h1').text
    except Exception:
        name = None

    # get product price
    try:
        price = driver.find_element_by_xpath('//span[@class="price-sales"]').text
    except Exception:
        price = None

    # get product image
    img_file = None
    # the image file is named after the product, so without a name there is
    # nothing sensible to save (previously this produced 'image/None.png')
    if name:
        try:
            # size
            img_width, img_height = 300, 300

            # get all images; the product image is at index 0
            images = driver.find_elements_by_xpath('//img[@class="primary-image cloudzoom"]')
            src = images[0].get_attribute('src')

            # characters such as '/' would change or break the target path;
            # ordinary names are left untouched
            safe_name = re.sub(r'[\\/:*?"<>|]+', '_', name).strip()
            os.makedirs('image', exist_ok=True)
            target = f'image/{safe_name}.png'

            # download image
            urllib.request.urlretrieve(src, target)

            # resize image (smaller size); the with-block closes the source
            # file before it is overwritten
            with Image.open(target) as ori_img:
                resize_img = ori_img.resize((img_width, img_height))
            resize_img.save(target)

            img_file = target
        except Exception as exc:
            # still best-effort, but say why the image is missing
            print(f'image not saved for {URL}: {exc!r}')
            img_file = None

    print(driver, name, price, img_file)

    # newline='' is what the csv module expects for files it writes to (it
    # emits its own line endings); utf-8 matches what pandas reads by default
    with open(f'data/{category}.csv', mode='a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([name, price, img_file, URL])

In [10]:
# testing: the multi-threading cell below reads `category` to pick the links
# file, so the trial runs for this single category only
category = 'activewear'

In [16]:
# try multi-threading - main function

start_time = time.time()

no_of_threads = 2

# load all links (the with-block closes the file again)
with open(f'data/{category}_links.txt', 'r') as f:
    links = f.read().split()

# testing with the first 4 links only
links = links[:4]


def _scrape_links_with_driver(thread_driver, thread_category, thread_links):
    """Scrape the given links one after another with a single driver."""
    for thread_link in thread_links:
        product_detail(thread_driver, thread_category, thread_link)


drivers = []
try:
    # open unique driver for each thread
    for _ in range(no_of_threads):
        drivers.append(open_web_browser())

    # Link k goes to driver k % no_of_threads, as before. Each driver is now
    # handed to exactly one task that walks through its own links, so a
    # browser session is never used by two threads at the same time (with one
    # task per link the pool could give two tasks of the same driver to
    # different threads).
    with ThreadPoolExecutor(max_workers=no_of_threads) as executor:
        futures = [
            executor.submit(
                _scrape_links_with_driver,
                drivers[j],
                category,
                links[j::no_of_threads],
            )
            for j in range(no_of_threads)
        ]
        # result() re-raises anything that went wrong inside a worker;
        # without it such errors are silently dropped
        for future in futures:
            future.result()
finally:
    # quit() ends each browser session, also when scraping failed part-way
    for opened_driver in drivers:
        opened_driver.quit()

end_time = time.time()
print("total time taken:", end_time-start_time)

<selenium.webdriver.chrome.webdriver.WebDriver (session="375612317f2c794172f631d3ecb37b74")> LACE LIGHTWEAR JACKET HKD 903.00 None
<selenium.webdriver.chrome.webdriver.WebDriver (session="fa14beddbd8ff3448ad9e7a4e92e0422")> LOGO SHORT SLEEVE TEE HKD 273.00 None
<selenium.webdriver.chrome.webdriver.WebDriver (session="375612317f2c794172f631d3ecb37b74")> LACE FITSENSE MESH HEM LEGGINGS HKD 553.00 None
<selenium.webdriver.chrome.webdriver.WebDriver (session="fa14beddbd8ff3448ad9e7a4e92e0422")> BACK V LACE SHORT SLEEVE DRESS HKD 623.00 None
total time taken: 10.159064054489136


In [15]:
# show the links that the multi-threading test cell works on
links

['https://www.calvinklein.com/hk/en/lace-lightwear-jacket-4WT1O553.html?dwvar_4WT1O553_color=BRIGHT%20WHITE',
 'https://www.calvinklein.com/hk/en/logo-short-sleeve-tee-4WT1K131.html?dwvar_4WT1K131_color=BRIGHT%20WHITE',
 'https://www.calvinklein.com/hk/en/lace-fitsense-mesh-hem-leggings-4WT1L784.html?dwvar_4WT1L784_color=CK%20BLACK',
 'https://www.calvinklein.com/hk/en/back-v-lace-short-sleeve-dress-4WS1D981.html?dwvar_4WS1D981_color=CK%20BLACK']

In [14]:
# performance
# NOTE(review): `driver3` is not assigned anywhere in the visible notebook and
# the recorded output of this cell is a NameError - this looks like a leftover
# from an earlier session; confirm and remove the cell or point it at an
# existing driver.
driver3.close()

NameError: name 'driver3' is not defined

In [11]:
# read the saved 'activewear' results back and preview the first rows
activewear_csv = f'data/{categories[0]}.csv'
df = pd.read_csv(activewear_csv)
df.head()

Unnamed: 0,name,price,img_file,url
0,LACE LIGHTWEAR JACKET,HKD 903.00,image/LACE LIGHTWEAR JACKET.png,https://www.calvinklein.com/hk/en/lace-lightwe...
1,LOGO SHORT SLEEVE TEE,HKD 390.00,image/LOGO SHORT SLEEVE TEE.png,https://www.calvinklein.com/hk/en/logo-short-s...
2,LACE FITSENSE MESH HEM LEGGINGS,HKD 553.00,image/LACE FITSENSE MESH HEM LEGGINGS.png,https://www.calvinklein.com/hk/en/lace-fitsens...
3,BACK V LACE SHORT SLEEVE DRESS,HKD 623.00,image/BACK V LACE SHORT SLEEVE DRESS.png,https://www.calvinklein.com/hk/en/back-v-lace-...
4,CK ONE TANGA BIKINI,HKD 392.00,image/CK ONE TANGA BIKINI.png,https://www.calvinklein.com/hk/en/ck-one-tanga...
5,ACTIVE ICON FULL LENGTH ALL-OVER PRINT LEGGINGS,HKD 483.00,image/ACTIVE ICON FULL LENGTH ALL-OVER PRINT L...,https://www.calvinklein.com/hk/en/active-icon-...
6,ACTIVE ICON HOODED SWEAT JACKET,HKD 693.00,image/ACTIVE ICON HOODED SWEAT JACKET.png,https://www.calvinklein.com/hk/en/active-icon-...
7,CK ONE BRALETTE,HKD 392.00,image/CK ONE BRALETTE.png,https://www.calvinklein.com/hk/en/ck-one-brale...
8,LOGO SHORT SLEEVE TEE,HKD 390.00,image/LOGO SHORT SLEEVE TEE.png,https://www.calvinklein.com/hk/en/logo-short-s...
9,PRIDE RAINBOW TAPE HOODED JACKET,"HKD 1,290.00",image/PRIDE RAINBOW TAPE HOODED JACKET.png,https://www.calvinklein.com/hk/en/pride-rainbo...


In [12]:
# read the saved 'jackets' results back and preview the first rows
jackets_csv = f'data/{categories[1]}.csv'
df = pd.read_csv(jackets_csv)
df.head()

Unnamed: 0,name,price,img_file,url
0,REFLECTION PACKABLE ZIP UP HOODIE,"HKD 1,393.00",image/REFLECTION PACKABLE ZIP UP HOODIE.png,https://www.calvinklein.com/hk/en/reflection-p...
1,CORE NYLON WINDBREAKER,"HKD 1,290.00",image/CORE NYLON WINDBREAKER.png,https://www.calvinklein.com/hk/en/core-nylon-w...
2,REFLECTION PACKABLE ZIP UP HOODIE,"HKD 1,393.00",image/REFLECTION PACKABLE ZIP UP HOODIE.png,https://www.calvinklein.com/hk/en/reflection-p...
3,CORE NYLON WINDBREAKER,"HKD 1,290.00",image/CORE NYLON WINDBREAKER.png,https://www.calvinklein.com/hk/en/core-nylon-w...
4,PRIDE CROPPED DENIM VEST,"HKD 1,290.00",image/PRIDE CROPPED DENIM VEST.png,https://www.calvinklein.com/hk/en/pride-croppe...
5,SHORT HOODED WINDBREAKER,"HKD 1,253.00",image/SHORT HOODED WINDBREAKER.png,https://www.calvinklein.com/hk/en/short-hooded...
6,HYBRID DENIM JACKET,"HKD 2,093.00",image/HYBRID DENIM JACKET.png,https://www.calvinklein.com/hk/en/hybrid-denim...
7,CHECKED SUIT JACKET,"HKD 1,716.00",image/CHECKED SUIT JACKET.png,https://www.calvinklein.com/hk/en/checked-suit...
8,ZIP UP HOODED JACKET,HKD 597.00,image/ZIP UP HOODED JACKET.png,https://www.calvinklein.com/hk/en/zip-up-hoode...
9,WOVEN HOODED JACKET,"HKD 1,095.00",image/WOVEN HOODED JACKET.png,https://www.calvinklein.com/hk/en/woven-hooded...


In [13]:
# read the saved 'sweatshirts-hoodies' results back and preview the first rows
sweatshirts_hoodies_csv = f'data/{categories[2]}.csv'
df = pd.read_csv(sweatshirts_hoodies_csv)
df.head()

Unnamed: 0,name,price,img_file,url
0,MICRO BRANDING HOOD,HKD 763.00,image/MICRO BRANDING HOOD.png,https://www.calvinklein.com/hk/en/modern-essen...
1,PRIDE CROPPED HOODIE,"HKD 1,190.00",image/PRIDE CROPPED HOODIE.png,https://www.calvinklein.com/hk/en/pride-croppe...
2,ORGANIC COTTON MONOGRAM SWEATSHIRT,HKD 890.00,image/ORGANIC COTTON MONOGRAM SWEATSHIRT.png,https://www.calvinklein.com/hk/en/organic-cott...
3,MODERN ESSENTIALS+ MICRO LOGO HOODIE,HKD 763.00,image/MODERN ESSENTIALS+ MICRO LOGO HOODIE.png,https://www.calvinklein.com/hk/en/modern-essen...
4,PRIDE ALL OVER PRINT HOODIE,"HKD 1,290.00",image/PRIDE ALL OVER PRINT HOODIE.png,https://www.calvinklein.com/hk/en/pride-all-ov...
5,REFLECTION INSTITUTIONAL ZIP UP HOODIE,HKD 903.00,image/REFLECTION INSTITUTIONAL ZIP UP HOODIE.png,https://www.calvinklein.com/hk/en/reflection-i...
6,PRIDE LOGO CROPPED SWEATSHIRT,"HKD 1,090.00",image/PRIDE LOGO CROPPED SWEATSHIRT.png,https://www.calvinklein.com/hk/en/pride-logo-c...
7,PRIDE LOGO CROPPED SWEATSHIRT,"HKD 1,090.00",image/PRIDE LOGO CROPPED SWEATSHIRT.png,https://www.calvinklein.com/hk/en/pride-logo-c...
8,OMBRE GRADIENT LOGO ORGANIC COTTON SWEATSHIRT,HKD 890.00,image/OMBRE GRADIENT LOGO ORGANIC COTTON SWEAT...,https://www.calvinklein.com/hk/en/ombre-gradie...
9,LOGO TAPE ZIP THROUGH HOODIE,"HKD 1,090.00",image/LOGO TAPE ZIP THROUGH HOODIE.png,https://www.calvinklein.com/hk/en/logo-tape-zi...


In [14]:
# END