# Capstone Project: Data Scraping for Independent Set

In this notebook, we create an independent dataset on which to deploy the model. The data is scraped from the Singapore brand [GRAYE](https://grayestudio.com), which markets itself as a sustainable designer label.

In [1]:
# import necessary libraries

import pickle
import re
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Get URLs for T-shirt products

In [2]:
# check requests: fetch the t-shirt collection page and show the HTTP status

tee_url = 'https://grayestudio.com/collections/t-shirts'
# Bound the wait so an unresponsive server cannot hang the kernel indefinitely.
tee_res = requests.get(tee_url, timeout=30)
tee_res.status_code

200

In [3]:
# Parse the t-shirt collection response and display its <body> as a sanity check

tee_soup = BeautifulSoup(markup=tee_res.content, features='lxml')
tee_soup.find('body')

<body><div class="edit_popup" style="display: none;">
<form class="edit_form" data-action="/cart/add" enctype="multipart/form-data" id="edit_cart_popup" method="post">
</form>
</div><!-- END app snippet --><!-- END app app block --><script defer="defer" src="https://cdn.shopify.com/extensions/07fd1ccd-f05b-4d31-94d4-a07a2e88cdad/0.4.0/assets/omnisend-in-shop.js" type="text/javascript"></script>
<script defer="defer" src="https://cdn.shopify.com/extensions/4ab84696-81cc-4661-9fb4-2578a25bce95/0.94.0/assets/appointo_bundle.js" type="text/javascript"></script>
<script defer="defer" src="https://cdn.shopify.com/extensions/0dc675f6-6451-4e34-ac60-441d0df6340d/5.38.0/assets/hulkcode.js" type="text/javascript"></script>
<link href="https://cdn.shopify.com/extensions/0dc675f6-6451-4e34-ac60-441d0df6340d/5.38.0/assets/hulkcode.css" media="all" rel="stylesheet" type="text/css"/>
<link href="https://monorail-edge.shopifysvc.com" rel="dns-prefetch"/>
<script>(function(){if ("sendBeacon" in navigat

In [4]:
# Collect every product card (<div class="product__grid__info">) on the t-shirt page
tee_text = tee_soup.find_all('div', class_='product__grid__info')

In [5]:
# Pull the href of the first <a> inside each t-shirt product card.
# Cards without any <a> are skipped; the hrefs are site-relative at this point.

card_anchors = (card.find('a') for card in tee_text)
tee_links = [anchor.get('href') for anchor in card_anchors if anchor]

In [6]:
# Resolve every t-shirt product link against the shop root to get an absolute URL

from urllib.parse import urljoin

tee_links_list = [
    urljoin('https://grayestudio.com', partial_link)
    for partial_link in tee_links
]

In [7]:
# get product name
# Starts the shared `product` list; later sections append to it.

product = []

for page_url in tee_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than parsing an error page as if it were a product page.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Append exactly one entry per URL (None when the title heading is absent)
    # so `product` stays row-aligned with the other per-product lists.
    title_tag = page_soup.find('h1', {'class':'product__title heading-size-8'})
    product.append(title_tag.text.strip() if title_tag is not None else None)

In [8]:
# get material composition
# Starts the shared `material` list; later sections append to it.

material = []

for page_url in tee_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than silently recording nothing for this product.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Record exactly one value per product page: the first populated span found
    # in a "tab-content-1" div, or None when there is none. Appending zero or
    # several values for one page would shift every later row out of alignment
    # with `product`.
    material_text = None
    for tab in page_soup.find_all('div', {'class':'tab-content tab-content-1'}):
        span = tab.find('span', {'class':'metafield-multi_line_text_field'})
        if span is not None:
            material_text = span.text.strip()
            break
    material.append(material_text)

In [9]:
# get care instructions
# Starts the shared `wash_care` list; later sections append to it.

wash_care = []

for page_url in tee_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than silently recording nothing for this product.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Record exactly one value per product page: the first populated span found
    # in a "tab-content-4" div, or None when there is none. Appending zero or
    # several values for one page would shift every later row out of alignment
    # with `product`.
    care_text = None
    for tab in page_soup.find_all('div', {'class':'tab-content tab-content-4'}):
        span = tab.find('span', {'class':'metafield-multi_line_text_field'})
        if span is not None:
            care_text = span.text.strip()
            break
    wash_care.append(care_text)

### Get URLs for top products

In [10]:
# check requests: fetch the tops collection page and show the HTTP status

top_url = 'https://grayestudio.com/collections/tops'
# Bound the wait so an unresponsive server cannot hang the kernel indefinitely.
top_res = requests.get(top_url, timeout=30)
top_res.status_code

200

In [11]:
# Parse the tops collection response and display its <body> as a sanity check

top_soup = BeautifulSoup(markup=top_res.content, features='lxml')
top_soup.find('body')

<body><div class="edit_popup" style="display: none;">
<form class="edit_form" data-action="/cart/add" enctype="multipart/form-data" id="edit_cart_popup" method="post">
</form>
</div><!-- END app snippet --><!-- END app app block --><script defer="defer" src="https://cdn.shopify.com/extensions/4ab84696-81cc-4661-9fb4-2578a25bce95/0.94.0/assets/appointo_bundle.js" type="text/javascript"></script>
<script defer="defer" src="https://cdn.shopify.com/extensions/0dc675f6-6451-4e34-ac60-441d0df6340d/5.38.0/assets/hulkcode.js" type="text/javascript"></script>
<link href="https://cdn.shopify.com/extensions/0dc675f6-6451-4e34-ac60-441d0df6340d/5.38.0/assets/hulkcode.css" media="all" rel="stylesheet" type="text/css"/>
<script defer="defer" src="https://cdn.shopify.com/extensions/07fd1ccd-f05b-4d31-94d4-a07a2e88cdad/0.4.0/assets/omnisend-in-shop.js" type="text/javascript"></script>
<link href="https://monorail-edge.shopifysvc.com" rel="dns-prefetch"/>
<script>(function(){if ("sendBeacon" in navigat

In [12]:
# Collect every product card (<div class="product__grid__info">) on the tops page
top_text = top_soup.find_all('div', class_='product__grid__info')

In [13]:
# Pull the href of the first <a> inside each tops product card.
# Cards without any <a> are skipped; the hrefs are site-relative at this point.

card_anchors = (card.find('a') for card in top_text)
top_links = [anchor.get('href') for anchor in card_anchors if anchor]

In [14]:
# Resolve every tops product link against the shop root to get an absolute URL

top_links_list = [
    urljoin('https://grayestudio.com', partial_link)
    for partial_link in top_links
]

In [15]:
# get product name
# Appends to the `product` list created in the t-shirt section; re-running this
# cell on its own appends the same products a second time.

for page_url in top_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than parsing an error page as if it were a product page.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Append exactly one entry per URL (None when the title heading is absent)
    # so `product` stays row-aligned with the other per-product lists.
    title_tag = page_soup.find('h1', {'class':'product__title heading-size-8'})
    product.append(title_tag.text.strip() if title_tag is not None else None)

In [16]:
# get material composition
# Appends to the `material` list created in the t-shirt section; re-running this
# cell on its own appends the same entries a second time.

for page_url in top_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than silently recording nothing for this product.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Record exactly one value per product page: the first populated span found
    # in a "tab-content-1" div, or None when there is none. Appending zero or
    # several values for one page would shift every later row out of alignment
    # with `product`.
    material_text = None
    for tab in page_soup.find_all('div', {'class':'tab-content tab-content-1'}):
        span = tab.find('span', {'class':'metafield-multi_line_text_field'})
        if span is not None:
            material_text = span.text.strip()
            break
    material.append(material_text)

In [17]:
# get care instructions
# Appends to the `wash_care` list created in the t-shirt section; re-running this
# cell on its own appends the same entries a second time.

for page_url in top_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than silently recording nothing for this product.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Record exactly one value per product page: the first populated span found
    # in a "tab-content-4" div, or None when there is none. Appending zero or
    # several values for one page would shift every later row out of alignment
    # with `product`.
    care_text = None
    for tab in page_soup.find_all('div', {'class':'tab-content tab-content-4'}):
        span = tab.find('span', {'class':'metafield-multi_line_text_field'})
        if span is not None:
            care_text = span.text.strip()
            break
    wash_care.append(care_text)

### Get URLs for outerwear products

In [18]:
# check requests: fetch the outerwear collection page and show the HTTP status

ow_url = 'https://grayestudio.com/collections/outerwears'
# Bound the wait so an unresponsive server cannot hang the kernel indefinitely.
ow_res = requests.get(ow_url, timeout=30)
ow_res.status_code

200

In [19]:
# Parse the outerwear collection response and display its <body> as a sanity check

ow_soup = BeautifulSoup(markup=ow_res.content, features='lxml')
ow_soup.find('body')

<body><div class="edit_popup" style="display: none;">
<form class="edit_form" data-action="/cart/add" enctype="multipart/form-data" id="edit_cart_popup" method="post">
</form>
</div><!-- END app snippet --><!-- END app app block --><script defer="defer" src="https://cdn.shopify.com/extensions/0dc675f6-6451-4e34-ac60-441d0df6340d/5.38.0/assets/hulkcode.js" type="text/javascript"></script>
<link href="https://cdn.shopify.com/extensions/0dc675f6-6451-4e34-ac60-441d0df6340d/5.38.0/assets/hulkcode.css" media="all" rel="stylesheet" type="text/css"/>
<script defer="defer" src="https://cdn.shopify.com/extensions/4ab84696-81cc-4661-9fb4-2578a25bce95/0.94.0/assets/appointo_bundle.js" type="text/javascript"></script>
<script defer="defer" src="https://cdn.shopify.com/extensions/07fd1ccd-f05b-4d31-94d4-a07a2e88cdad/0.4.0/assets/omnisend-in-shop.js" type="text/javascript"></script>
<link href="https://monorail-edge.shopifysvc.com" rel="dns-prefetch"/>
<script>(function(){if ("sendBeacon" in navigat

In [20]:
# Collect every product card (<div class="product__grid__info">) on the outerwear page
ow_text = ow_soup.find_all('div', class_='product__grid__info')

In [21]:
# Pull the href of the first <a> inside each outerwear product card.
# Cards without any <a> are skipped; the hrefs are site-relative at this point.

card_anchors = (card.find('a') for card in ow_text)
ow_links = [anchor.get('href') for anchor in card_anchors if anchor]

In [22]:
# Resolve every outerwear product link against the shop root to get an absolute URL

ow_links_list = [
    urljoin('https://grayestudio.com', partial_link)
    for partial_link in ow_links
]

In [23]:
# get product name
# Appends to the `product` list created in the t-shirt section; re-running this
# cell on its own appends the same products a second time.

for page_url in ow_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than parsing an error page as if it were a product page.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Append exactly one entry per URL (None when the title heading is absent)
    # so `product` stays row-aligned with the other per-product lists.
    title_tag = page_soup.find('h1', {'class':'product__title heading-size-8'})
    product.append(title_tag.text.strip() if title_tag is not None else None)

In [24]:
# get material composition
# Appends to the `material` list created in the t-shirt section; re-running this
# cell on its own appends the same entries a second time.

for page_url in ow_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than silently recording nothing for this product.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Record exactly one value per product page: the first populated span found
    # in a "tab-content-1" div, or None when there is none. Appending zero or
    # several values for one page would shift every later row out of alignment
    # with `product`.
    material_text = None
    for tab in page_soup.find_all('div', {'class':'tab-content tab-content-1'}):
        span = tab.find('span', {'class':'metafield-multi_line_text_field'})
        if span is not None:
            material_text = span.text.strip()
            break
    material.append(material_text)

In [25]:
# get care instructions
# Appends to the `wash_care` list created in the t-shirt section; re-running this
# cell on its own appends the same entries a second time.

for page_url in ow_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than silently recording nothing for this product.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Record exactly one value per product page: the first populated span found
    # in a "tab-content-4" div, or None when there is none. Appending zero or
    # several values for one page would shift every later row out of alignment
    # with `product`.
    care_text = None
    for tab in page_soup.find_all('div', {'class':'tab-content tab-content-4'}):
        span = tab.find('span', {'class':'metafield-multi_line_text_field'})
        if span is not None:
            care_text = span.text.strip()
            break
    wash_care.append(care_text)

### Get URLs for bottoms products

In [26]:
# check requests: fetch the pants collection page and show the HTTP status

b_url = 'https://grayestudio.com/collections/pants'
# Bound the wait so an unresponsive server cannot hang the kernel indefinitely.
b_res = requests.get(b_url, timeout=30)
b_res.status_code

200

In [27]:
# Parse the pants collection response and display its <body> as a sanity check

b_soup = BeautifulSoup(markup=b_res.content, features='lxml')
b_soup.find('body')

<body><div class="edit_popup" style="display: none;">
<form class="edit_form" data-action="/cart/add" enctype="multipart/form-data" id="edit_cart_popup" method="post">
</form>
</div><!-- END app snippet --><!-- END app app block --><script defer="defer" src="https://cdn.shopify.com/extensions/07fd1ccd-f05b-4d31-94d4-a07a2e88cdad/0.4.0/assets/omnisend-in-shop.js" type="text/javascript"></script>
<script defer="defer" src="https://cdn.shopify.com/extensions/0dc675f6-6451-4e34-ac60-441d0df6340d/5.38.0/assets/hulkcode.js" type="text/javascript"></script>
<link href="https://cdn.shopify.com/extensions/0dc675f6-6451-4e34-ac60-441d0df6340d/5.38.0/assets/hulkcode.css" media="all" rel="stylesheet" type="text/css"/>
<script defer="defer" src="https://cdn.shopify.com/extensions/4ab84696-81cc-4661-9fb4-2578a25bce95/0.94.0/assets/appointo_bundle.js" type="text/javascript"></script>
<link href="https://monorail-edge.shopifysvc.com" rel="dns-prefetch"/>
<script>(function(){if ("sendBeacon" in navigat

In [28]:
# Collect every product card (<div class="product__grid__info">) on the pants page
b_text = b_soup.find_all('div', class_='product__grid__info')

In [29]:
# Pull the href of the first <a> inside each pants product card.
# Cards without any <a> are skipped; the hrefs are site-relative at this point.

card_anchors = (card.find('a') for card in b_text)
b_links = [anchor.get('href') for anchor in card_anchors if anchor]

In [30]:
# Resolve every pants product link against the shop root to get an absolute URL

b_links_list = [
    urljoin('https://grayestudio.com', partial_link)
    for partial_link in b_links
]

In [31]:
# get product name
# Appends to the `product` list created in the t-shirt section; re-running this
# cell on its own appends the same products a second time.

# The loop uses page_* names so it does not overwrite b_url / b_res / b_soup,
# which hold the pants collection page fetched and parsed earlier.
for page_url in b_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than parsing an error page as if it were a product page.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Append exactly one entry per URL (None when the title heading is absent)
    # so `product` stays row-aligned with the other per-product lists.
    title_tag = page_soup.find('h1', {'class':'product__title heading-size-8'})
    product.append(title_tag.text.strip() if title_tag is not None else None)

In [32]:
# get material composition
# Appends to the `material` list created in the t-shirt section; re-running this
# cell on its own appends the same entries a second time.

# The loop uses page_* names so it does not overwrite b_url / b_res / b_soup,
# which hold the pants collection page fetched and parsed earlier.
for page_url in b_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than silently recording nothing for this product.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Record exactly one value per product page: the first populated span found
    # in a "tab-content-1" div, or None when there is none. Appending zero or
    # several values for one page would shift every later row out of alignment
    # with `product`.
    material_text = None
    for tab in page_soup.find_all('div', {'class':'tab-content tab-content-1'}):
        span = tab.find('span', {'class':'metafield-multi_line_text_field'})
        if span is not None:
            material_text = span.text.strip()
            break
    material.append(material_text)

In [33]:
# get care instructions
# Appends to the `wash_care` list created in the t-shirt section; re-running this
# cell on its own appends the same entries a second time.

# The loop uses page_* names so it does not overwrite b_url / b_res / b_soup,
# which hold the pants collection page fetched and parsed earlier.
for page_url in b_links_list:
    # Time out instead of hanging on a stalled connection, and fail loudly on an
    # HTTP error rather than silently recording nothing for this product.
    page_res = requests.get(page_url, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.content, 'lxml')

    # Record exactly one value per product page: the first populated span found
    # in a "tab-content-4" div, or None when there is none. Appending zero or
    # several values for one page would shift every later row out of alignment
    # with `product`.
    care_text = None
    for tab in page_soup.find_all('div', {'class':'tab-content tab-content-4'}):
        span = tab.find('span', {'class':'metafield-multi_line_text_field'})
        if span is not None:
            care_text = span.text.strip()
            break
    wash_care.append(care_text)

In [34]:
# Print the number of entries in each scraped list, one per line
# (product, then material, then wash_care); the three counts should match.
for scraped_values in (product, material, wash_care):
    print(len(scraped_values))

94
94
94


In [35]:
# Combine the three scraped lists into one table, one column per list,
# then display it as the cell output.
graye_columns = dict(product=product, material=material, wash_care=wash_care)
graye = pd.DataFrame(graye_columns)

graye

Unnamed: 0,product,material,wash_care
0,V Ribbed Tee - Mint Green,100% Cotton,Machine Wash Cool Do not bleach Do not tumble ...
1,V Ribbed Tee - Space Blue,100% Cotton,Machine Wash Cool Do not bleach Do not tumble ...
2,Cupro tee - Stone,49.8% Cupro\n46.1% Cotton\n4.1% Lycra\nDouble ...,Machine Wash Cool\nDo Not Bleach\nDo Not Tumbl...
3,Cupro Tee - Brick,49.8% Cupro\n46.1% Cotton\n4.1% Lycra\nDouble ...,Machine Wash Cool\nDo Not Bleach\nDo Not Tumbl...
4,Cupro Tee - Jade Green,49.8% Cupro\n46.1% Cotton\n4.1% Lycra \nDouble...,Machine Wash Cool\nDo Not Bleach\nDo Not Tumbl...
...,...,...,...
89,Unisex Boxer Shorts - Camel,100% Cotton Poplin,Machine Wash Cool\nDo Not Bleach\nDo Not Tumbl...
90,Unisex Boxer Shorts - Ivory,100% Cotton Poplin,Machine Wash Cool\nDo Not Bleach\nDo Not Tumbl...
91,Elasticated Workwear Pants,100% Cotton,Machine Wash Cool\nDo Not Bleach\nDo Not Tumbl...
92,Detachable Suspender Trousers,100% Cotton,Machine Wash Cool\nWash With Similar Colours\n...


In [36]:
# save as csv

# Create the output folder first: to_csv raises OSError when the target
# directory does not exist, which would discard the whole scraping run.
graye_csv_path = Path('data/graye_og.csv')
graye_csv_path.parent.mkdir(parents=True, exist_ok=True)
graye.to_csv(graye_csv_path, index=False)