In [1]:
# imports
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

# Basics

In [None]:
# Wikipedia article that the cells below download and parse.
URL = "https://en.wikipedia.org/wiki/Questrom_School_of_Business"

In [None]:
# Download the page. Without an explicit timeout, requests waits indefinitely
# on an unresponsive server, which would hang a "Restart & Run All".
resp = requests.get(URL, timeout=30)

In [None]:
# HTTP status code returned for the request.
resp.status_code

In [None]:
# Preview only the beginning of the raw HTML; displaying the entire page
# would flood the notebook output and bloat the saved file.
resp.text[:500]

In [None]:
# Parse the downloaded HTML into a navigable tree using the built-in parser.
soup = BeautifulSoup(resp.text, features="html.parser")

In [None]:
# Look up the <title> tag explicitly, then show the text it contains.
title = soup.find("title")
title.get_text()

In [None]:
# Same result via attribute-style access to the <title> tag.
soup.title.text 

In [None]:
# body
# Text content of the <body> element.
soup.body.text

In [None]:
# Flatten the body text onto a single line for easier reading.
# Stored as `body_text` rather than `body`: the next cell binds `body` to the
# <body> Tag itself, and one name should not hold two different types.
body_text = soup.body.text
body_text.strip().replace("\n", " ")

In [None]:
# search for the body
# Here `body` is the <body> tag object itself (markup included), not just its text.
body = soup.find('body')
body

In [19]:
# look for the paragraphs
# find_all() collects every <p> tag in the parsed document.
pars = soup.find_all("p")

In [21]:
# find_all() hands back a bs4 ResultSet.
type(pars)

bs4.element.ResultSet

In [22]:
# Collect the text of every <p> tag into a plain list of strings.
paragraphs = [par.text for par in pars]

In [23]:
# Inspect the extracted paragraph strings.
paragraphs

['The Questrom School of Business (formerly, the Boston University School of Management) is the business school at Boston University in Boston, MA, USA. Founded in 1913 as the College of Business Administration, the school offers undergraduate and graduate programs.\n',
 'The BU Questrom School of Business offers a Bachelor of Science in Business Administration (BSBA), Master of Business Administration (MBA) degree (full- and part-time programs), a Master of Science (MS) in Mathematical Finance, a Master of Science in Management Studies (MSMS), executive education programs, and two Ph.D. programs. Both the undergraduate and graduate programs offer dual degree options with other schools and colleges at Boston University.\n',
 'Questrom has some 250 full-time faculty and some 200 part-time faculty, teaching fellows, and active research assistants.[1]\n',
 'October 13, 1913, Boston University business began classes for students in the College of Business Administration. The first three co

In [24]:
# Number of <p> tags found on the page.
len(paragraphs)

20

In [25]:
# Merge all paragraphs into one string, separated by single spaces.
" ".join(paragraphs)

"The Questrom School of Business (formerly, the Boston University School of Management) is the business school at Boston University in Boston, MA, USA. Founded in 1913 as the College of Business Administration, the school offers undergraduate and graduate programs.\n The BU Questrom School of Business offers a Bachelor of Science in Business Administration (BSBA), Master of Business Administration (MBA) degree (full- and part-time programs), a Master of Science (MS) in Mathematical Finance, a Master of Science in Management Studies (MSMS), executive education programs, and two Ph.D. programs. Both the undergraduate and graduate programs offer dual degree options with other schools and colleges at Boston University.\n Questrom has some 250 full-time faculty and some 200 part-time faculty, teaching fellows, and active research assistants.[1]\n October 13, 1913, Boston University business began classes for students in the College of Business Administration. The first three courses were En

---

# Scraping Links

In [26]:
# Same parse as in the Basics section (`resp` has not changed since then);
# presumably repeated so this section starts from a fresh soup.
soup = BeautifulSoup(resp.text, 'html.parser')

In [27]:
# Every <a> (anchor) tag on the page.
links = soup.find_all("a")

In [29]:
# Peek at the first ten anchors.
links[:10]

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a class="image" href="/wiki/File:BU_School_of_Management.JPG"><img alt="BU School of Management.JPG" data-file-height="1000" data-file-width="1474" decoding="async" height="170" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/74/BU_School_of_Management.JPG/250px-BU_School_of_Management.JPG" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/74/BU_School_of_Management.JPG/375px-BU_School_of_Management.JPG 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/7/74/BU_School_of_Management.JPG/500px-BU_School_of_Management.JPG 2x" width="250"/></a>,
 <a class="mw-redirect" href="/wiki/Private_school" title="Private school">Private</a>,
 <a href="/wiki/Dean_(education)" title="Dean (education)">Dean</a>,
 <a href="/wiki/Undergraduate_education" title="Undergraduate education">Undergraduates</a>,
 <a href="/wiki/Postgraduate_educati

In [30]:
# This cell raises KeyError: indexing a tag with ['href'] fails when the
# attribute is missing, and some anchors (e.g. <a id="top">) have no href.
for link in links:
    link['href']

KeyError: 'href'

In [31]:
# list comprehension: keep the href of every anchor that actually has one
links_parsed = [
    anchor["href"]
    for anchor in links
    if anchor.has_attr("href")
]
links_parsed[:5]

['#mw-head',
 '#searchInput',
 '/wiki/File:BU_School_of_Management.JPG',
 '/wiki/Private_school',
 '/wiki/Dean_(education)']

In [32]:
# Visible text of every anchor, including anchors that have no href.
link_text = [anchor.get_text() for anchor in links]
link_text[:5]

['', 'Jump to navigation', 'Jump to search', '', 'Private']

In [33]:
# The lengths differ: links_parsed skips anchors without an href,
# while link_text has one entry for every anchor.
len(links_parsed) == len(link_text)

False

In [35]:
# (text, href) tuples built in a single pass, so each label stays paired with its URL
link_data = [
    (anchor.text, anchor["href"])
    for anchor in links
    if anchor.has_attr("href")
]
link_data[:5]

[('Jump to navigation', '#mw-head'),
 ('Jump to search', '#searchInput'),
 ('', '/wiki/File:BU_School_of_Management.JPG'),
 ('Private', '/wiki/Private_school'),
 ('Dean', '/wiki/Dean_(education)')]

In [37]:
# A longer preview: relative /wiki/ links and absolute http links are mixed together.
link_data[:20]

[('Jump to navigation', '#mw-head'),
 ('Jump to search', '#searchInput'),
 ('', '/wiki/File:BU_School_of_Management.JPG'),
 ('Private', '/wiki/Private_school'),
 ('Dean', '/wiki/Dean_(education)'),
 ('Undergraduates', '/wiki/Undergraduate_education'),
 ('Postgraduates', '/wiki/Postgraduate_education'),
 ('Boston', '/wiki/Boston'),
 ('Massachusetts', '/wiki/Massachusetts'),
 ('United States', '/wiki/United_States'),
 ('Urban', '/wiki/Urban_area'),
 ('Questrom Website', 'http://www.bu.edu/questrom/'),
 ('business school', '/wiki/Business_school'),
 ('Boston University', '/wiki/Boston_University'),
 ('Boston', '/wiki/Boston'),
 ('undergraduate', '/wiki/Undergraduate_education'),
 ('graduate', '/wiki/Graduate_education'),
 ('Bachelor of Science in Business Administration',
  '/wiki/Bachelor_of_Science_in_Business_Administration'),
 ('Master of Business Administration',
  '/wiki/Master_of_Business_Administration'),
 ('Master of Science', '/wiki/Master_of_Science')]

In [40]:
# let's only pull anchors whose href starts with "http"
http_links = soup.find_all("a", href=re.compile("^http.*"))
type(http_links)

bs4.element.ResultSet

In [41]:
# Pull the URL out of each matching anchor.
http_urls = [anchor["href"] for anchor in http_links]
http_urls[:5]

['http://www.bu.edu/questrom/',
 'http://management.bu.edu/about/facts/index.html',
 'https://web.archive.org/web/20060515211324/http://management.bu.edu/about/facts/index.html',
 'https://web.archive.org/web/20160304025610/http://www.bu.edu/interactive-design/2013/09/22/school-of-management-centennial/',
 'http://www.bu.edu/interactive-design/2013/09/22/school-of-management-centennial/']

# Utilities, Parsing Web Tables, and Downloading Files

In [2]:
# NOTE: rebinds URL; from here on it points at the Super Bowl champions page.
URL = "https://en.wikipedia.org/wiki/List_of_Super_Bowl_champions"

In [3]:
# read_html fetches the page and returns a list of DataFrames,
# one for each HTML table it finds.
tables = pd.read_html(URL)

In [4]:
# Confirm the return type: a plain Python list.
type(tables)

list

In [5]:
# Number of tables parsed from the page.
len(tables)

16

In [8]:
# Pick the second table (index 1), previewed in the next cell. The index is
# purely positional, so re-check it if the page layout changes.
superbowl = tables[1]

In [9]:
# Preview the first three rows.
superbowl.head(3)

Unnamed: 0,Game,Date/Season,Winning team,Score,Losing team,Venue,City,Attendance,Ref
0,I[sb 1],"January 15, 1967 (1966 AFL/1966 NFL)","Green Bay Packersn(1, 1–0)",35–10,"Kansas City Chiefsa(1, 0–1)",Los Angeles Memorial Coliseum,"Los Angeles, California[sb 2]",61946,[12][13]
1,II[sb 1],"January 14, 1968 (1967 AFL/1967 NFL)","Green Bay Packersn(2, 2–0)",33–14,"Oakland Raidersa(1, 0–1)",Miami Orange Bowl,"Miami, Florida[sb 3]",75546,[14][13]
2,III[sb 1],"January 12, 1969 (1968 AFL/1968 NFL)","New York Jetsa(1, 1–0)",16–7,"Baltimore Coltsn(1, 0–1)",Miami Orange Bowl (2),"Miami, Florida (2)[sb 3]",75389,[15][13]


## Download data from the web


> This is not a notebook-specific action; the same command works from any shell.

In [None]:
# Location of the diamonds CSV file to download.
URL = "https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv"

> wget

In [10]:
! wget https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv

--2020-10-19 15:22:19--  https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv
Resolving vincentarelbundock.github.io (vincentarelbundock.github.io)... 185.199.109.153, 185.199.108.153, 185.199.111.153, ...
Connecting to vincentarelbundock.github.io (vincentarelbundock.github.io)|185.199.109.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3192560 (3.0M) [text/csv]
Saving to: ‘diamonds.csv’


2020-10-19 15:22:20 (5.96 MB/s) - ‘diamonds.csv’ saved [3192560/3192560]

