# Scrape across a web page

How do you scrape data from anywhere on a web page, not just from a table? This notebook is a short introduction.

### Read in the webpage

In [1]:
from io import StringIO
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

# What web page do we want?
pageAddress = "https://www.pro-football-reference.com/players/injuries.htm"

# Open and read in the web page.
# The `with` block closes the HTTP response as soon as BeautifulSoup has
# finished reading it, instead of leaving the connection open until the
# response object happens to be garbage-collected.
with urlopen(pageAddress) as response:
    soup = BeautifulSoup(response, "html.parser")

### Print the HTML (source HTML for the webpage)

In [2]:
# Print all the HTML, if you want to reference it.
# Uncomment the next line to dump the parsed page as indented HTML.
#print(soup.prettify())

### Use Pandas to read a table of data.

In [6]:
# Read the table of data into a DataFrame.
# Take the first <table> element found on the page.
table = soup.find_all('table')[0]

# pd.read_html returns a *list* of DataFrames (one per <table> it parses),
# so `df` is a list and the table itself is df[0].
# The HTML is wrapped in StringIO because passing a literal HTML string
# to read_html is deprecated (FutureWarning since pandas 2.1).
df = pd.read_html(StringIO(str(table)))
print(df[0])

               Player   Tm  Pos         Type         Class  \
0          Jeff Badet  ATL   WR          Leg  questionable   
1       Deadrin Senat  ATL   DT   Upper Body           I-R   
2           Matt Gono  ATL    T         Neck           I-R   
3       Willie Wright  ATL   OL      Illness           I-R   
4         John Atkins  ATL   DT      Illness           I-R   
..                ...  ...  ...          ...           ...   
157   Cornelius Lucas  WAS   OT      Illness           I-R   
158      Greg Stroman  WAS   CB  Undisclosed  questionable   
159  Deshazor Everett  WAS  SAF  Undisclosed           I-R   
160     Curtis Samuel  WAS   WR        Groin  questionable   
161    Landon Collins  WAS  SAF     Achilles  questionable   

                                               Details  
0    Badet is nursing a lower-leg injury, and it ha...  
1    Senat as been added to the active PUP list wit...  
2    Gono has been ticketed for the PUP list while ...  
3    Wright has landed on t

In [15]:
# Save the injuries table to CSV (without the row index).
# pd.read_html returns a list of DataFrames, and a list has no .to_csv(),
# so write the first table. The isinstance check also accepts `df` having
# been rebound to a single DataFrame in another cell.
injuries = df[0] if isinstance(df, list) else df
injuries.to_csv('2021_pfr_injuries.csv', index=False)

### Find all the links in the page (which have a tag of `<a ...>`)

In [4]:
# Find all the links in the webpage
links = soup.find_all('a')

for link in links:
    # First child of the <a> tag (usually its text). An anchor can be
    # empty, in which case link.contents is [] and indexing it directly
    # would raise IndexError and abort the loop.
    names = link.contents[0] if link.contents else None
    # .get() returns None (rather than raising) when the tag has no href.
    fullLink = link.get('href')
    print(fullLink)

#main-content
https://www.nfl.com
https://www.nfl.com/news/
https://www.nfl.com/scores/
https://www.nfl.com/schedules/
https://www.nfl.com/videos/
https://www.nfl.com/teams/
https://www.nfl.com/players/
https://www.nfl.com/stats/player-stats/
https://www.nfl.com/standings/
https://www.nfl.com/draft/
#2ndlevel
https://www.nfl.com/photos/
https://www.nfl.com/super-bowl/
https://www.nfl.com/gamepass?icampaign=gpg-nav-gno-gamepass
https://www.nfl.com/free-agency/
https://www.nfl.com/ways-to-watch/
https://www.foxdeportes.com/nfl/
https://www.nfl.com/causes/inspire-change/
https://www.nfl.com/network/watch/nfl-network-live
https://smart.link/qd5unmrz3lfwv
https://www.ticketmaster.com/nfl?wt.mc_id=NFL_LEAGUE_TICKETS_LINK&utm_source=NFL.com&utm_medium=client&utm_campaign=NFL_LEAGUE&utm_content=TICKETS_LINK
http://www.nflshop.com/?bm-nflcom-2017-Header-Shop-Tab
https://www.nfl.com/account/sign-in
https://www.nflshop.com/?_s=bm-nflcom-2017-Header-Shop-Tab-Peak
https://www.nfl.com/network/watch/

### Find specific types (classes) of links

In [5]:
# Find only the links to players
# You view the source of the HTML page to get the class.
# NOTE(review): this class name and the recorded output below look like
# they come from an nfl.com page rather than the pageAddress set in the
# first cell -- confirm which page `soup` should hold before re-running.
links = soup.find_all('a', class_="d3-o-player-fullname nfl-o-cta--link")

for link in links:
    # First child of the <a> tag (usually the player's name). Guard against
    # an empty anchor, where link.contents[0] would raise IndexError.
    names = link.contents[0] if link.contents else None
    # .get() returns None (rather than raising) when the tag has no href.
    fullLink = link.get('href')
    print(fullLink)

/players/jameis-winston/
/players/dak-prescott/
/players/jared-goff/
/players/philip-rivers/
/players/matt-ryan/
/players/russell-wilson/
/players/tom-brady/
/players/derek-carr/
/players/carson-wentz/
/players/patrick-mahomes/
/players/aaron-rodgers/
/players/jimmy-garoppolo/
/players/deshaun-watson/
/players/baker-mayfield/
/players/kyler-murray/
/players/kirk-cousins/
/players/ryan-fitzpatrick/
/players/andy-dalton/
/players/kyle-allen/
/players/gardner-minshew/
/players/mitchell-trubisky/
/players/lamar-jackson/
/players/josh-allen-4/
/players/daniel-jones/
/players/sam-darnold/
