In [None]:
# Install these packages (once, in the environment the notebook kernel uses):
# pip install beautifulsoup4
# pip install requests
# pip install pandas

In [70]:
from bs4 import BeautifulSoup
import requests # url --> html file

import pandas as pd
import re # regex

#### scraping from an HTML file

In [78]:
# Parse a local HTML file and print the text of the first <p> found in each <div>.
# The file is opened in binary mode so BeautifulSoup works out the document's
# encoding itself instead of relying on the platform's default text encoding.
with open('example.html', 'rb') as file:
    soup = BeautifulSoup(file, 'html.parser')

# The whole document is parsed by the constructor, so the queries below do not
# need the file handle to stay open.
divs = soup.find_all('div')  # every <div> in the document

# `find` returns None for a <div> that contains no <p>; skip those rather than
# failing with AttributeError on `.text`.
first_paragraphs = [div.find('p') for div in divs]
print([p.text for p in first_paragraphs if p is not None])

['some text', 'some more text']


#### scraping from the Web

with BeautifulSoup

In [80]:
# Download the 2018 NBA draft article and parse it into a BeautifulSoup tree.
url = 'https://en.wikipedia.org/wiki/2018_NBA_draft'
# Without a timeout a stalled connection would block the kernel indefinitely.
request = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page further down.
request.raise_for_status()
# print(request.content)

soup = BeautifulSoup(request.content, 'html.parser')
# Preview only the beginning of the document: the full page is very large, and the
# browser inspector is the better place to explore the HTML anyway.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   2018 NBA draft - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled";(function(){var cookie=document.cookie.match(/(?:^|; 

In [91]:
# Pick the table with class "wikitable" at index 2 on the page.
# NOTE(review): the index is positional, so it breaks if tables are added or
# removed upstream - confirm it still points at the draft-pick table.
players = soup.find_all('table', {'class': 'wikitable'})[2]
rows = players.find_all('tr')
# print(rows)

# Marker characters (*, +, ~) that the page appends to some cell text.
# A raw string is used so the backslash-free character class reaches the regex
# engine untouched; the previous non-raw '\*' / '\+' are invalid string escapes.
marker_chars = re.compile(r'[*+~]')

# One list of cleaned <td> strings per row. The first <tr> is skipped
# (presumably the header row, which carries no data cells).
data = [
    [marker_chars.sub('', cell.text.strip()) for cell in row.find_all('td')]
    for row in rows[1:]
]

In [53]:
# Quick check of the marker-stripping pattern: both '*' and '~' are removed.
# The pattern is a raw string so `\*` is passed to the regex engine as written
# rather than being read as an (invalid) string escape sequence.
re.sub(r'\*|~', '', '*~')

''

In [92]:
# Column labels, listed in the same left-to-right order as the cells scraped
# from each table row.
draft_columns = ['Rnd.', 'Pick', 'Player', 'Pos.', 'Nationality', 'Team', 'School']

# Build the frame from the scraped rows and show the first few records.
player_df = pd.DataFrame(data, columns=draft_columns)
player_df.head()

Unnamed: 0,Rnd.,Pick,Player,Pos.,Nationality,Team,School
0,1,1,Deandre Ayton,C,Bahamas,Phoenix Suns,Arizona (Fr.)
1,1,2,Marvin Bagley III,PF,United States,Sacramento Kings,Duke (Fr.)
2,1,3,Luka Dončić,PG,Slovenia,Atlanta Hawks (traded to Dallas)[a],Real Madrid (Spain)
3,1,4,Jaren Jackson Jr.,PF,United States,Memphis Grizzlies,Michigan State (Fr.)
4,1,5,Trae Young,PG,United States,Dallas Mavericks (traded to Atlanta)[a],Oklahoma (Fr.)


In [93]:
# Save the scraped draft table to disk; the DataFrame's row index is not written.
player_df.to_csv(path_or_buf='players.csv', index=False)

with pd.read_html()
- requires the data to be in an HTML `<table>` element to work well

In [95]:
# example of data in a table
# pip install html5lib
url = 'https://en.wikipedia.org/wiki/2018_NBA_draft'
# `match` is treated as a regular expression, so the dot is escaped to require the
# literal text "Rnd." instead of "Rnd" followed by any character.
tables = pd.read_html(url, match=r'Rnd\.')  # returns a *list* of the tables whose text matches
# print(len(tables))
tables[0].head()

Unnamed: 0,.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Rnd.,Pick,Player,Pos.,Nationality[n 1],Team,School / club team
0,1,1,Deandre Ayton,C,Bahamas,Phoenix Suns,Arizona (Fr.)
1,1,2,Marvin Bagley III,PF,United States,Sacramento Kings,Duke (Fr.)
2,1,3,Luka Dončić*~,PG,Slovenia,Atlanta Hawks (traded to Dallas)[a],Real Madrid (Spain)
3,1,4,Jaren Jackson Jr.+,PF,United States,Memphis Grizzlies,Michigan State (Fr.)
4,1,5,Trae Young*,PG,United States,Dallas Mavericks (traded to Atlanta)[a],Oklahoma (Fr.)


#### scraping from the Web (Ex. 2)

In [127]:
# Example of data that sits inside a table but is not cleanly formatted.
# pip install lxml
url = 'https://www.imdb.com/chart/moviemeter'

# read_html hands back a list of DataFrames; the first one is used here.
imdb = pd.read_html(io=url)
imdb_df = imdb[0]
imdb_df.head(n=5)
# narrower view of the same frame: imdb_df.iloc[:,1:4].head()

Unnamed: 0.1,Unnamed: 0,Rank & Title,IMDb Rating,Your Rating,Unnamed: 4
0,,Spider-Man: Across the Spider-Verse (2023) 1...,9.0,12345678910 NOT YET RELEASED Seen,
1,,The Little Mermaid (2023) 2 ( 1),7.2,12345678910 NOT YET RELEASED Seen,
2,,Transformers: Rise of the Beasts (2023) 3 ( 24),6.6,12345678910 NOT YET RELEASED Seen,
3,,Expend4bles (2023) 4 ( 62),,12345678910 NOT YET RELEASED Seen,
4,,Spider-Man: Into the Spider-Verse (2018) 5 (...,8.4,12345678910 NOT YET RELEASED Seen,


What do we want to scrape?
Top 100 movies as of today
- movie poster (link to image)
- title
- rating

- AND cast (this one's the toughie)

Scraping strategy
- look for unique identifiers (tag, id, class...)
- try grabbing one element to make sure it works
- grab the rest

In [128]:
# Fetch the chart page with requests so BeautifulSoup can walk the raw HTML.
# `url` is the IMDb chart address assigned in the pd.read_html cell earlier on.
# The timeout keeps a stalled connection from blocking the kernel indefinitely.
request = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page.
request.raise_for_status()
soup = BeautifulSoup(request.content, 'html.parser')

In [134]:
# Collect the poster image URL for every row of the chart.
# The rows are looked up inside the <tbody> carrying the class "lister-list".
imdb_table = soup.find('tbody', class_='lister-list')

# One <td class="posterColumn"> per row; find_all returns them as a list.
posters = imdb_table.find_all('td', class_='posterColumn')

# Keep the part of each `src` before the first '._' and re-attach '.jpg'
# (presumably this drops a thumbnail-size suffix - confirm against the page).
img_srcs = [
    poster.find('img').get('src').partition('._')[0] + '.jpg'
    for poster in posters
]
img_srcs[:3]

['https://m.media-amazon.com/images/M/MV5BNzQ1ODUzYjktMzRiMS00ODNiLWI4NzQtOTRiN2VlNTNmODFjXkEyXkFqcGdeQXVyMTkxNjUyNQ@@.jpg',
 'https://m.media-amazon.com/images/M/MV5BYTUxYjczMWUtYzlkZC00NTcwLWE3ODQtN2I2YTIxOTU0ZTljXkEyXkFqcGdeQXVyMTkxNjUyNQ@@.jpg',
 'https://m.media-amazon.com/images/M/MV5BZTNiNDA4NmMtNTExNi00YmViLWJkMDAtMDAxNmRjY2I2NDVjXkEyXkFqcGdeQXVyMDM2NDM2MQ@@.jpg']

In [137]:
# Pull the title and the year out of each title cell of the chart.
# Rank and rating are left as an exercise.
titleColumn = imdb_table.find_all('td', class_='titleColumn')

# The link text inside each cell is the movie title.
titles = [cell.find('a').text for cell in titleColumn]

# The year is the text of the "secondaryInfo" element with every '(' and ')'
# character removed.
strip_parens = str.maketrans('', '', '()')
years = [
    cell.find(class_='secondaryInfo').text.translate(strip_parens)
    for cell in titleColumn
]
years[:3]

['2023', '2023', '2023']

In [None]:
# get cast?