# 1.1 Imports

In [1]:
# import libraries
import os
import csv
import numpy as np
import pandas as pd

import urllib
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import time
from PIL import Image

In [2]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# create function
# open web browser

def configure_driver(driver_path=None):
    """Start a headless, incognito Chrome session and return its driver.

    Parameters
    ----------
    driver_path : str, optional
        Location of the chromedriver executable. When omitted, the
        ``CHROMEDRIVER_PATH`` environment variable is used if it is set;
        otherwise the path this notebook was originally written with is
        used, so existing ``configure_driver()`` calls keep working.

    Returns
    -------
    selenium.webdriver.Chrome
        The driver object produced by ``webdriver.Chrome``.
    """
    if driver_path is None:
        # avoid forcing every user to edit the notebook for their machine
        driver_path = os.environ.get(
            'CHROMEDRIVER_PATH',
            '/Users/abrahamleung/Documents/chromedriver',
        )

    chrome_options = Options()
    # incognito window
    chrome_options.add_argument("--incognito")
    # run the browser headless
    chrome_options.add_argument("--headless")

    # NOTE(review): passing the executable path positionally is the
    # Selenium 3 calling convention - confirm the pinned Selenium version.
    driver = webdriver.Chrome(driver_path, options=chrome_options)
    return driver

In [4]:
# product categories to scrape
categories = ['activewear', 'jackets', 'sweatshirts-hoodies']

# 1.2 Extracting Links

In [5]:
# create function
# get product links

def product_links(driver, category):
    """Save the first 15 product links of a category listing to a text file.

    Loads the category listing page, reads the ``href`` of the first 15
    ``<a class="name-link">`` elements and writes them, one per line, to
    ``data/{category}_links.txt``. Any previous content of that file is
    replaced. Elements without an ``href`` are skipped.

    Parameters
    ----------
    driver : selenium webdriver
        An open browser session used to load the listing page.
    category : str
        Category slug inserted into the listing URL and the output file name.
    """
    # category link
    URL = f'https://www.calvinklein.com/hk/en/women-apparel-{category}/'

    # navigate webpage
    driver.get(URL)

    # short pause after navigation (original note: may need time sleep)
    time.sleep(0.5)

    # find_elements(by, value) is used instead of find_elements_by_xpath
    # because the latter was removed from newer Selenium releases
    product_elements = driver.find_elements('xpath', '//a[@class="name-link"]')

    # collect the hrefs before touching the file so a lookup failure does
    # not leave a truncated links file behind
    hrefs = [element.get_attribute('href') for element in product_elements[:15]]

    # the output folder may not exist on a fresh checkout
    os.makedirs('data', exist_ok=True)

    # a single "w" open both creates and truncates the file, and the
    # with-block guarantees the handle is closed
    with open(f"data/{category}_links.txt", "w") as f:
        for href in hrefs:
            if href:
                f.write(href)
                f.write('\n')

In [6]:
# scrape all links

# open web browser
driver = configure_driver()

try:
    for category in categories:
        # extract and save product links
        product_links(driver, category)
finally:
    # quit() ends the whole WebDriver session (close() only closes the
    # current window); the finally ensures the browser is released even
    # when a category fails
    driver.quit()

# 1.3 Extracting Product Details

In [7]:
# create function
# get product details

def product_detail(driver, URL):
    """Scrape the name, price and primary image of one product page.

    Parameters
    ----------
    driver : selenium webdriver
        An open browser session used to load the page.
    URL : str
        Address of the product page.

    Returns
    -------
    tuple
        ``(name, price, img_file)``. ``name`` is the text of the first
        ``<h1>``, ``price`` the text of ``<span class="price-sales">`` and
        ``img_file`` the path of the downloaded, resized image. Each item is
        ``None`` when it could not be obtained; the image is only fetched
        when a name was found, because the name is used as the file name.
    """
    # input website
    driver.get(URL)

    # short pause after navigation (original note: may need time sleep)
    time.sleep(0.5)

    # `except Exception` rather than a bare `except` so that interrupting
    # the kernel still stops a long scrape
    try:
        name = driver.find_element('tag name', 'h1').text
    except Exception:
        name = None

    try:
        price = driver.find_element('xpath', '//span[@class="price-sales"]').text
    except Exception:
        price = None

    img_file = None
    if name is not None:
        try:
            img_file = _save_product_image(driver, name)
        except Exception:
            # best effort: a failed download/resize just leaves no image
            img_file = None

    return name, price, img_file


def _save_product_image(driver, name, size=(300, 300)):
    """Download the page's primary image, resize it and return its path (or None)."""
    # local import: a plain `import urllib` does not guarantee that the
    # `urllib.request` submodule has been loaded
    import urllib.request

    images = driver.find_elements('xpath', '//img[@class="primary-image cloudzoom"]')
    if not images:
        return None

    # product image at index 0
    src = images[0].get_attribute('src')
    if not src:
        return None

    # the output folder may not exist on a fresh checkout
    os.makedirs('image', exist_ok=True)

    # a path separator inside the product name would point into a
    # sub-directory that does not exist
    safe_name = name.replace('/', '_').replace(os.sep, '_')
    img_path = f'image/{safe_name}.png'

    # download image
    urllib.request.urlretrieve(src, img_path)

    # resize image (smaller size); the with-block closes the source file
    # before the same path is overwritten
    with Image.open(img_path) as ori_img:
        resize_img = ori_img.resize(size)
    resize_img.save(img_path)

    return img_path

In [30]:
# open web browser
driver = configure_driver()

# may need time sleep
time.sleep(1)

try:
    # for every category, scrape every link
    for category in categories:

        start_time = time.time()

        # load all links; the with-block closes the file after reading
        with open(f'data/{category}_links.txt', 'r') as f:
            links = f.read().split()

        names = []
        prices = []
        img_files = []
        urls = []

        # scrape every link
        for link in links:
            name, price, img_file = product_detail(driver, link)

            # append data into lists
            names.append(name)
            prices.append(price)
            img_files.append(img_file)
            urls.append(link)

        # convert to dataframe
        df = pd.DataFrame({
            'name': names,
            'price': prices,
            'img_file': img_files,
            'url': urls
        })

        # remove rows with missing values and renumber the remaining rows
        df = df.dropna().reset_index(drop=True)

        # save file
        df.to_csv(f'data/{category}.csv', index=False)

        end_time = time.time()

        # time of scraping, creating dataframe and saving as csv file
        time_taken = end_time - start_time

        print(f'{category}:')
        print(f"- no of items: {len(links)} ")
        print(f"- total time taken: {round(time_taken,2)}s")

        # average time taken for each item; an empty links file would
        # otherwise raise ZeroDivisionError
        average_time = time_taken / len(links) if links else 0.0

        print(f"- average time for each item: {round(average_time,2)}s")
        print('\n')
finally:
    # quit() ends the whole WebDriver session (close() only closes the
    # current window); the finally releases the browser even when a
    # category fails part-way through
    driver.quit()

jackets:
- no of items: 15 
- total time taken: 36.59s
- average time for each item: 2.44s




In [25]:
# load the saved 'activewear' results and preview the first rows
csv_path = f'data/{categories[0]}.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,name,price,img_file,url
0,CK ONE TANGA BIKINI,HKD 392.00,image/CK ONE TANGA BIKINI.png,https://www.calvinklein.com/hk/en/ck-one-tanga...
1,CK ONE BRALETTE,HKD 392.00,image/CK ONE BRALETTE.png,https://www.calvinklein.com/hk/en/ck-one-brale...
2,ACTIVE ICON WOVEN SHORTS,HKD 413.00,image/ACTIVE ICON WOVEN SHORTS.png,https://www.calvinklein.com/hk/en/active-icon-...
3,ACTIVE ICON FULL LENGTH ALL-OVER PRINT LEGGINGS,HKD 483.00,image/ACTIVE ICON FULL LENGTH ALL-OVER PRINT L...,https://www.calvinklein.com/hk/en/active-icon-...
4,LACE LIGHTWEAR JACKET,HKD 903.00,image/LACE LIGHTWEAR JACKET.png,https://www.calvinklein.com/hk/en/lace-lightwe...


In [26]:
# number of successfully scraped items (rows in the loaded dataframe)

df.shape[0]

12

In [32]:
# load the saved 'jackets' results and preview the first rows
csv_path = f'data/{categories[1]}.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,name,price,img_file,url
0,REFLECTION PACKABLE ZIP UP HOODIE,"HKD 1,393.00",image/REFLECTION PACKABLE ZIP UP HOODIE.png,https://www.calvinklein.com/hk/en/reflection-p...
1,CHINESE NEW YEAR CAPSULE REVERSIBLE JACKET,"HKD 1,603.00",image/CHINESE NEW YEAR CAPSULE REVERSIBLE JACK...,https://www.calvinklein.com/hk/en/chinese-new-...
2,SHORT HOODED WINDBREAKER,"HKD 1,253.00",image/SHORT HOODED WINDBREAKER.png,https://www.calvinklein.com/hk/en/short-hooded...
3,CHINESE NEW YEAR CAPSULE RELAXED DENIM TRUCKER...,"HKD 1,533.00",image/CHINESE NEW YEAR CAPSULE RELAXED DENIM T...,https://www.calvinklein.com/hk/en/chinese-new-...
4,HYBRID DENIM JACKET,"HKD 2,093.00",image/HYBRID DENIM JACKET.png,https://www.calvinklein.com/hk/en/hybrid-denim...


In [33]:
# number of successfully scraped items (rows in the loaded dataframe)

df.shape[0]

11

In [21]:
# load the saved 'sweatshirts-hoodies' results and preview the first rows
csv_path = f'data/{categories[2]}.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,name,price,img_file,url
0,MODERN ESSENTIALS+ MICRO LOGO HOODIE,HKD 763.00,image/MODERN ESSENTIALS+ MICRO LOGO HOODIE.png,https://www.calvinklein.com/hk/en/modern-essen...
1,REFLECTION HOODED CAPE,HKD 903.00,image/REFLECTION HOODED CAPE.png,https://www.calvinklein.com/hk/en/reflection-h...
2,REFLECTION INSTITUTIONAL ZIP UP HOODIE,HKD 903.00,image/REFLECTION INSTITUTIONAL ZIP UP HOODIE.png,https://www.calvinklein.com/hk/en/reflection-i...
3,PREMIUM MIX MEDIA HOODIE,"HKD 1,183.00",image/PREMIUM MIX MEDIA HOODIE.png,https://www.calvinklein.com/hk/en/premium-mix-...
4,PREMIUM FLORAL PRINT SWEATSHIRT,HKD 903.00,image/PREMIUM FLORAL PRINT SWEATSHIRT.png,https://www.calvinklein.com/hk/en/premium-flor...


In [22]:
# number of successfully scraped items (rows in the loaded dataframe)

df.shape[0]

12

In [12]:
# END