In [1]:
# the main library you will need for webscraping is called Beautiful Soup
from bs4 import BeautifulSoup

# the second package we will need we already know it
import requests

In [2]:
url = "https://en.wikipedia.org/wiki/Marie_Curie"

response = requests.get(url)
response

<Response [200]>

In [3]:
# when you make your request to the webpage, if successfull it should return the full html content of that page
response.content

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Marie Curie - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"b6a86d1b-f8ab-48c9-b7d2-9784cc451435","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Marie_Curie","wgTitle":"Marie Curie","wgCurRevisionId":1117641602,"wgRevisionId":1117641602,"wgArticleId":20408,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 Polish-language sources (pl)","CS1: Julian\xe2\x80\x93Gregorian uncertainty","Webarchive template wayback links","CS1 French-language sources (fr)","A

In [4]:
# turning the response into a beautiful soup object
soup = BeautifulSoup(response.content)

In [5]:
type(soup)

bs4.BeautifulSoup

In [6]:
# prettify the soup to then copy it to a text editor and study its structure
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Marie Curie - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"b6a86d1b-f8ab-48c9-b7d2-9784cc451435","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Marie_Curie","wgTitle":"Marie Curie","wgCurRevisionId":1117641602,"wgRevisionId":1117641602,"wgArticleId":20408,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 Polish-language sources (pl)","CS1: Julian–Gregorian uncertainty","Webarchive template wayback links","CS1 French-language sources (fr)","A

In [7]:
# now that we have the html code inside a soup object -> we can explore it's attributes
# I can call the title tag of the webpage -> this brings the tag and the content
soup.title

<title>Marie Curie - Wikipedia</title>

In [8]:
# imagine you only wanted the content
soup.title.string

'Marie Curie - Wikipedia'

In [9]:
# imagine I want paragraphs (p tag)
soup.p
# this is no good, clearly there are many p tags which we want

<p class="mw-empty-elt">
</p>

In [10]:
# whenever we want to find all the tags of a specific type we use the find_all method

paragraphs = soup.find_all('p')
paragraphs

[<p class="mw-empty-elt">
 </p>,
 <p><b>Marie Salomea Skłodowska–Curie</b> (<span class="rt-commentedText nowrap"><span class="IPA nopopups noexcerpt" lang="en-fonipa"><a href="/wiki/Help:IPA/English" title="Help:IPA/English">/<span style="border-bottom:1px dotted"><span title="/ˈ/: primary stress follows">ˈ</span><span title="'k' in 'kind'">k</span><span title="/j/: 'y' in 'yes'">j</span><span title="/ʊər/: 'our' in 'tour'">ʊər</span><span title="/i/: 'y' in 'happy'">i</span></span>/</a></span></span> <a href="/wiki/Help:Pronunciation_respelling_key" title="Help:Pronunciation respelling key"><i title="English pronunciation respelling"><span style="font-size:90%">KURE</span>-ee</i></a>,<sup class="reference" id="cite_ref-4"><a href="#cite_note-4">[4]</a></sup> <small>French pronunciation: ​</small><span class="IPA" lang="fr-Latn-fonipa" title="Representation in the International Phonetic Alphabet (IPA)"><a href="/wiki/Help:IPA/French" title="Help:IPA/French">[maʁi kyʁi]</a></span>, <sm

In [11]:
# what kind of object was returned here?
# behaves like a list for many purposes BUT its still very powerful because it still "knows"
# the html structure of the remaining tags
type(paragraphs)

bs4.element.ResultSet

In [12]:
# can we loop over it?
# the text method allows you to return the text component of the element in this list

for element in paragraphs:
  print(element.text)  



Marie Salomea Skłodowska–Curie (/ˈkjʊəri/ KURE-ee,[4] French pronunciation: ​[maʁi kyʁi], Polish pronunciation: [ˈmarja skwɔˈdɔfska kʲiˈri]; born Maria Salomea Skłodowska, Polish: [ˈmarja salɔˈmɛa skwɔˈdɔfska]; 7 November 1867 – 4 July 1934) was a Polish and naturalized-French physicist and chemist who conducted pioneering research on radioactivity. She was the first woman to win a Nobel Prize, the first person and the only woman to win the Nobel Prize twice, and the only person to win the Nobel Prize in two scientific fields. Her husband, Pierre Curie, was a co-winner on her first Nobel Prize, making them the first ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes. She was, in 1906, the first woman to become a professor at the University of Paris.[5]

She was born in Warsaw, in what was then the Kingdom of Poland, part of the Russian Empire. She studied at Warsaw's clandestine Flying University and began her practical scientific tr

In [13]:
# you can search both by the tag but also by other attributes, such as the class name
tables = soup.find_all('table', attrs= {'class' : 'infobox biography vcard'})

#this is very helpful to identify boxes that use the same css styling, for which an attrivute is already defined

In [14]:
# finds all the text elements inside the table
table = tables[0]
table

<table class="infobox biography vcard"><tbody><tr><th class="infobox-above" colspan="2" style="font-size:125%;"><div class="fn" style="display:inline">Marie Curie</div></th></tr><tr><td class="infobox-image" colspan="2"><a class="image" href="/wiki/File:Marie_Curie_c._1920s.jpg"><img alt="Marie Curie c. 1920s.jpg" data-file-height="2105" data-file-width="1549" decoding="async" height="299" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/c8/Marie_Curie_c._1920s.jpg/220px-Marie_Curie_c._1920s.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/c8/Marie_Curie_c._1920s.jpg/330px-Marie_Curie_c._1920s.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/c8/Marie_Curie_c._1920s.jpg/440px-Marie_Curie_c._1920s.jpg 2x" width="220"/></a><div class="infobox-caption">Curie <abbr title="circa">c.</abbr><span style="white-space:nowrap;"> 1920</span></div></td></tr><tr><th class="infobox-label" scope="row">Born</th><td class="infobox-data"><div class="nickname" style="display:inl

In [15]:
# inside the first level of my table, there are still many many tags
# you can find more tags within your table

# the table itself has many tags inside -> it is a soup object itself
for line in table.find_all('li'):
  print(line.text)

Poland (by birth)
France (by marriage)
University of Paris
ESPCI[3]
Pioneering research on radioactivity
Discovering polonium and radium
Irène
Ève
Nobel Prize in Physics (1903)
Davy Medal (1903)
Matteucci Medal (1904)
Actonian Prize (1907)
Elliott Cresson Medal (1909)
Albert Medal (1910)
Nobel Prize in Chemistry (1911)
Willard Gibbs Award (1921)
Cameron Prize for Therapeutics of the University of Edinburgh (1931)
Physics
chemistry
University of Paris
Institut du Radium
Institut du Radium
École Normale Supérieure
French Academy of Medicine
International Committee on Intellectual Cooperation
André-Louis Debierne
Ladislas Goldstein
Émile Henriot
Irène Joliot-Curie
Óscar Moreno
Marguerite Perey
Francis Perrin


In [None]:
# do it yourself:
# find all the bio fields category names for Mdme Curie

In [16]:
url = "https://en.wikipedia.org/wiki/Marie_Curie"

response = requests.get(url)
soup = BeautifulSoup(response.content)
tables = soup.find_all('table', attrs= {'class' : 'infobox biography vcard'})
for line in tables[0].find_all('th'):
  print(line.text)

Marie Curie
Born
Died
Cause of death
Citizenship
Alma mater
Known for
Spouse
Children
Awards
Fields
Institutions
Thesis
Doctoral advisor
Doctoral students
Signature
Notes


## Problems and respectful scraping

In [None]:
# problems you may have

# your IP getting blocked by the site

# the webpage might not be loading all the content in a static way

In [17]:
import time
time.sleep(5)
print("took me a while to show up...")

took me a while to show up...


In [None]:
# also, robots.txt
#https://www.lidl.com/robots.txt
#https://www.walmart.com/robots.txt
#https://www.nike.com/robots.txt

## Complex scraping use case

In [18]:
# webscrape farfetch -> generate a dataframe with the information of all items of prada

# get url
url = "https://www.farfetch.com/pt/shopping/men/prada/items.aspx"
#make the request
response = requests.get(url)

In [19]:
response.status_code

403

In [20]:
#headers allow you to "pretend" to be a web browser
headers= {'Accept-Encoding':'gzip, deflate',
          'Accept-Language':'en-US,en;q=0.9',
          'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'}

In [21]:
response = requests.get(url, headers = headers)
response.status_code

200

In [22]:
#convert it to soup

soup = BeautifulSoup(response.content)
soup 

<!DOCTYPE html>
<html dir="LTR" lang="en"><head><title>Prada for Men - Shop New Arrivals - FARFETCH</title><meta content="Shop the latest arrivals from Prada on FARFETCH. Enjoy climate conscious ✈ delivery &amp; free returns. Get the best designer pieces delivered to your door." name="description"/>
<link href="https://dd6zx4ibq538k.cloudfront.net" rel="preconnect"/>
<link href="https://cdn-static.farfetch-contents.com" rel="preconnect"/>
<link href="https://cdn-images.farfetch-contents.com" rel="preconnect"/>
<link href="https://www.googletagmanager.com" rel="preconnect"/>
<link href="https://www.google-analytics.com" rel="preconnect"/>
<link href="https://securepubads.g.doubleclick.net" rel="preconnect"/>
<link href="https://pagead2.googlesyndication.com" rel="preconnect"/>
<link href="https://c.bannerflow.net" rel="preconnect"/>
<link href="https://s.go-mpulse.net" rel="preconnect"/>
<link href="https://adservice.google.com" rel="preconnect"/>
<link href="https://www.farfetch.com/pt

In [23]:
products = soup.find_all('div', attrs = {'data-component' : 'ProductCard'})

# here we get all the products as a list 
products

[<div class="ltr-x69rqn e19e7out0" data-component="ProductCard" itemid="/pt/shopping/men/prada-logo-print-shirt-jacket-item-18917323.aspx?storeid=9800"><style data-emotion="ltr 1gxq4h9">.ltr-1gxq4h9{grid-row:1/3;}</style><a aria-label="Prada: logo-print shirt jacket" class="ltr-1gxq4h9 e4l1wga0" data-component="ProductCardLink" href="/pt/shopping/men/prada-logo-print-shirt-jacket-item-18917323.aspx?storeid=9800" target="_self"><style data-emotion="ltr 15vtm2m">.ltr-15vtm2m [data-component='LabelSecondaryLight'],.ltr-15vtm2m [data-component='LabelSecondaryDark']{position:absolute;top:1rem;max-width:calc(100% - 4.4rem);pointer-events:none;-webkit-transition:opacity var(--motion-functional-duration-m) var(--motion-functional-easing-standard);transition:opacity var(--motion-functional-duration-m) var(--motion-functional-easing-standard);}@media (min-width: 20em){.ltr-15vtm2m [data-component='LabelSecondaryLight'],.ltr-15vtm2m [data-component='LabelSecondaryDark']{max-width:calc(100% - 4.4r

In [25]:
len(products)

12

In [26]:
# but from a given product we just want to get the url to the main page

product = products[1]
product
# product is still a soup object
#inside of a tag you get attributes like in a dictionary
product.a['href']


url = "https://www.farfetch.com" + product.a['href']

In [27]:
url

'https://www.farfetch.com/pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx?storeid=9636'

In [28]:
# if To build a list with all the products
url_list = ["https://www.farfetch.com" + product.a['href'] for product in products]

In [29]:
url_list

['https://www.farfetch.com/pt/shopping/men/prada-logo-print-shirt-jacket-item-18917323.aspx?storeid=9800',
 'https://www.farfetch.com/pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx?storeid=9636',
 'https://www.farfetch.com/pt/shopping/men/prada-district-metallic-effect-sneakers-item-18750293.aspx?storeid=12113',
 'https://www.farfetch.com/pt/shopping/men/prada-straight-leg-press-stud-cotton-trousers-item-18831329.aspx?storeid=12113',
 'https://www.farfetch.com/pt/shopping/men/prada-engraved-logo-woven-belt-item-18851388.aspx?storeid=9260',
 'https://www.farfetch.com/pt/shopping/men/prada-shetland-wool-turtleneck-sweater-item-18732574.aspx?storeid=12113',
 'https://www.farfetch.com/pt/shopping/men/prada-brushed-leather-lace-up-shoes-item-18211832.aspx?storeid=9868',
 'https://www.farfetch.com/pt/shopping/men/prada-logo-plaque-trousers-item-18700064.aspx?storeid=9359',
 'https://www.farfetch.com/pt/shopping/men/prada-re-nylon-quilted-bucket-hat-item-18963351.as

In [30]:
# now that we have all the urls for all the clothes, we have to understand how to get
# the information of one item

# got the url
item_url = 'https://www.farfetch.com/pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx?storeid=9636'

# make the request

response = requests.get(item_url,headers=headers)

soup = BeautifulSoup(response.content)

In [32]:
soup

<!DOCTYPE html>
<html dir="LTR" lang="en"><head><title>Prada Stretch Poplin long-sleeve Shirt - Farfetch</title><meta content="Shop Prada stretch poplin long-sleeve shirt" name="description"/>
<link href="https://dd6zx4ibq538k.cloudfront.net" rel="preconnect"/>
<link href="https://cdn-static.farfetch-contents.com" rel="preconnect"/>
<link href="https://cdn-images.farfetch-contents.com" rel="preconnect"/>
<link href="https://www.googletagmanager.com" rel="preconnect"/>
<link href="https://www.google-analytics.com" rel="preconnect"/>
<link href="https://securepubads.g.doubleclick.net" rel="preconnect"/>
<link href="https://pagead2.googlesyndication.com" rel="preconnect"/>
<link href="https://c.bannerflow.net" rel="preconnect"/>
<link href="https://s.go-mpulse.net" rel="preconnect"/>
<link href="https://adservice.google.com" rel="preconnect"/>
<link href="https://www.farfetch.com/pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx" rel="canonical"/>
<link href="https

In [37]:
# within the soup I want to find the price

result = soup.find_all('p', attrs={'data-component' : 'PriceLarge'})
result
price = result[0].text
print(price)

636 €


In [38]:
result = soup.find_all('p', attrs={'class' : 'ltr-13ze6d5-Body efhm1m90'})
name = result[0].text
print(name)

stretch poplin long-sleeve shirt


In [39]:
{'name' : name, 'price' : price, 'url' : item_url}

{'name': 'stretch poplin long-sleeve shirt',
 'price': '636 €',
 'url': 'https://www.farfetch.com/pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx?storeid=9636'}

In [44]:
url_list

['https://www.farfetch.com//pt/shopping/men/prada-logo-print-shirt-jacket-item-18917323.aspx?storeid=9800',
 'https://www.farfetch.com//pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx?storeid=9636',
 'https://www.farfetch.com//pt/shopping/men/prada-district-metallic-effect-sneakers-item-18750293.aspx?storeid=12113',
 'https://www.farfetch.com//pt/shopping/men/prada-straight-leg-press-stud-cotton-trousers-item-18831329.aspx?storeid=12113',
 'https://www.farfetch.com//pt/shopping/men/prada-engraved-logo-woven-belt-item-18851388.aspx?storeid=9260',
 'https://www.farfetch.com//pt/shopping/men/prada-shetland-wool-turtleneck-sweater-item-18732574.aspx?storeid=12113',
 'https://www.farfetch.com//pt/shopping/men/prada-brushed-leather-lace-up-shoes-item-18211832.aspx?storeid=9868',
 'https://www.farfetch.com//pt/shopping/men/prada-logo-plaque-trousers-item-18700064.aspx?storeid=9359',
 'https://www.farfetch.com//pt/shopping/men/prada-re-nylon-quilted-bucket-hat-item-18

In [46]:
# got this for one item, now we loop for all

#lets save the data as a list
url_list = ["https://www.farfetch.com/" + product.a['href'] for product in products]
data = []

for link in url_list[:5]:
  print(link)
  time.sleep(5)
  try:
    response = requests.get(link,headers=headers)
    soup = BeautifulSoup(response.content)

    # get the price
    result = soup.find_all('p', attrs={'data-component' : 'PriceLarge'})
    price = result[0].text

    # get the name
    result = soup.find_all('p', attrs={'class' : 'ltr-13ze6d5-Body efhm1m90'})
    name = result[0].text
    
    data.append({'name' : name, 'price' : price, 'url' : item_url})
  except :
    pass




https://www.farfetch.com//pt/shopping/men/prada-logo-print-shirt-jacket-item-18917323.aspx?storeid=9800
1,990 €
logo-print shirt jacket
https://www.farfetch.com//pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx?storeid=9636
636 €
stretch poplin long-sleeve shirt
https://www.farfetch.com//pt/shopping/men/prada-district-metallic-effect-sneakers-item-18750293.aspx?storeid=12113
890 €
District metallic-effect sneakers
https://www.farfetch.com//pt/shopping/men/prada-straight-leg-press-stud-cotton-trousers-item-18831329.aspx?storeid=12113
1,450 €
straight-leg press stud cotton trousers
https://www.farfetch.com//pt/shopping/men/prada-engraved-logo-woven-belt-item-18851388.aspx?storeid=9260
420 €
engraved-logo woven belt


In [47]:
data

[{'name': 'logo-print shirt jacket',
  'price': '1,990 €',
  'url': 'https://www.farfetch.com/pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx?storeid=9636'},
 {'name': 'stretch poplin long-sleeve shirt',
  'price': '636 €',
  'url': 'https://www.farfetch.com/pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx?storeid=9636'},
 {'name': 'District metallic-effect sneakers',
  'price': '890 €',
  'url': 'https://www.farfetch.com/pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx?storeid=9636'},
 {'name': 'straight-leg press stud cotton trousers',
  'price': '1,450 €',
  'url': 'https://www.farfetch.com/pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx?storeid=9636'},
 {'name': 'engraved-logo woven belt',
  'price': '420 €',
  'url': 'https://www.farfetch.com/pt/shopping/men/prada-stretch-poplin-long-sleeve-shirt-item-18698404.aspx?storeid=9636'}]

In [48]:
import pandas as pd
pd.DataFrame(data)

Unnamed: 0,name,price,url
0,logo-print shirt jacket,"1,990 €",https://www.farfetch.com/pt/shopping/men/prada...
1,stretch poplin long-sleeve shirt,636 €,https://www.farfetch.com/pt/shopping/men/prada...
2,District metallic-effect sneakers,890 €,https://www.farfetch.com/pt/shopping/men/prada...
3,straight-leg press stud cotton trousers,"1,450 €",https://www.farfetch.com/pt/shopping/men/prada...
4,engraved-logo woven belt,420 €,https://www.farfetch.com/pt/shopping/men/prada...
