In [7]:
# Import scraping modules
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Import data manipulation modules
import pandas as pd
import numpy as np
# Import data visualization modules
import matplotlib as mpl
import matplotlib.pyplot as plt


# URL of page
url = 'https://www.pro-football-reference.com/years/2019/passing.htm'
# Open URL and pass to BeautifulSoup
html = urlopen(url)
stats_page = BeautifulSoup(html)

#findAll(name)
#Parameters
#name -- HTML tags to use to parse webpage
#Returns array of all matches to name tag

#getText()
#Returns text from HTML

# Collect table headers
column_headers = stats_page.findAll('tr')[0]
column_headers = [i.getText() for i in column_headers.findAll('th')]

print(column_headers)

# Collect table rows
rows = stats_page.findAll('tr')[1:]
# Get stats from each row
qb_stats = []
for i in range(len(rows)):
    qb_stats.append([col.getText() for col in rows[i].findAll('td')])

print(qb_stats[0])

['Rk', 'Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'QBR', 'Sk', 'Yds', 'Sk%', 'NY/A', 'ANY/A', '4QC', 'GWD']
['Jameis Winston', 'TAM', '25', 'QB', '16', '16', '7-9-0', '380', '626', '60.7', '5109', '33', '5.3', '30', '4.8', '243', '71', '8.2', '7.1', '13.4', '319.3', '84.3', '55.7', '47', '282', '7.0', '7.17', '6.15', '2', '2']


In [8]:
# Create DataFrame from our scraped data
data = pd.DataFrame(qb_stats, columns=column_headers[1:])

# Examine first five rows of data
data.head()

Unnamed: 0,Player,Tm,Age,Pos,G,GS,QBrec,Cmp,Att,Cmp%,...,Y/G,Rate,QBR,Sk,Yds,Sk%,NY/A,ANY/A,4QC,GWD
0,Jameis Winston,TAM,25,QB,16,16,7-9-0,380,626,60.7,...,319.3,84.3,55.7,47,282,7.0,7.17,6.15,2.0,2.0
1,Dak Prescott,DAL,26,QB,16,16,8-8-0,388,596,65.1,...,306.4,99.7,71.9,23,151,3.7,7.68,7.84,,
2,Jared Goff,LAR,25,QB,16,16,9-7-0,394,626,62.9,...,289.9,86.5,50.6,22,170,3.4,6.9,6.46,1.0,2.0
3,Philip Rivers,LAC,38,QB,16,16,5-11-0,390,591,66.0,...,288.4,88.5,50.5,34,222,5.4,7.03,6.32,1.0,2.0
4,Matt Ryan,ATL,34,QB,15,15,7-8-0,408,616,66.2,...,297.7,92.1,59.6,48,316,7.2,6.25,6.08,3.0,2.0


In [9]:
# View columns in data
data.columns

Index(['Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%',
       'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C',
       'Y/G', 'Rate', 'QBR', 'Sk', 'Yds', 'Sk%', 'NY/A', 'ANY/A', '4QC',
       'GWD'],
      dtype='object')

In [10]:
# Rename sack yards column to `Yds_Sack`
new_columns = data.columns.values
new_columns[9] = 'Completion %'
new_columns[11] = 'Touchdowns'
new_columns[13] = 'Interceptions'
new_columns[10] = 'Yards'
new_columns[17] = 'Yards/Attempt'
data.columns = new_columns
data.columns


Index(['Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att',
       'Completion %', 'Yards', 'Touchdowns', 'TD%', 'Interceptions', 'Int%',
       '1D', 'Lng', 'Yards/Attempt', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'QBR', 'Sk',
       'Yds', 'Sk%', 'NY/A', 'ANY/A', '4QC', 'GWD'],
      dtype='object')

In [11]:
data.head()

Unnamed: 0,Player,Tm,Age,Pos,G,GS,QBrec,Cmp,Att,Completion %,...,Y/G,Rate,QBR,Sk,Yds,Sk%,NY/A,ANY/A,4QC,GWD
0,Jameis Winston,TAM,25,QB,16,16,7-9-0,380,626,60.7,...,319.3,84.3,55.7,47,282,7.0,7.17,6.15,2.0,2.0
1,Dak Prescott,DAL,26,QB,16,16,8-8-0,388,596,65.1,...,306.4,99.7,71.9,23,151,3.7,7.68,7.84,,
2,Jared Goff,LAR,25,QB,16,16,9-7-0,394,626,62.9,...,289.9,86.5,50.6,22,170,3.4,6.9,6.46,1.0,2.0
3,Philip Rivers,LAC,38,QB,16,16,5-11-0,390,591,66.0,...,288.4,88.5,50.5,34,222,5.4,7.03,6.32,1.0,2.0
4,Matt Ryan,ATL,34,QB,15,15,7-8-0,408,616,66.2,...,297.7,92.1,59.6,48,316,7.2,6.25,6.08,3.0,2.0


In [12]:
# Select stat categories
categories = ['GS','Completion %', 'Yards', 'Touchdowns', 'Interceptions', 'Yards/Attempt']

# Create data subset for radar chart
data_radar = data[['Player', 'Tm'] + categories]
data_radar.head()

# Check data types
data_radar.dtypes

# Convert data to numerical values
pd.options.mode.chained_assignment = None  # default='warn'

for i in categories:
    data_radar[i] = pd.to_numeric(data[i])
    #data_radar = pd.to_numeric(data[i])
    
# Check data types
data_radar.dtypes

# Remove ornamental characters for achievements
data_radar['Player'] = data_radar['Player'].str.replace('*', '')
data_radar['Player'] = data_radar['Player'].str.replace('+', '')

  data_radar['Player'] = data_radar['Player'].str.replace('*', '')
  data_radar['Player'] = data_radar['Player'].str.replace('+', '')


In [13]:
# Filter by passing yards
data_radar_filtered = data_radar[data_radar['GS'] > 7]
data_radar_filtered = data_radar_filtered[data_radar_filtered['Yards'] > 400]

# Create columns with percentile rank
#for i in categories:
#    data_radar_filtered[i + '_Rank'] = data_radar_filtered[i].rank(pct=True)
# We need to flip the rank for interceptions
#data_radar_filtered['Int_Rank'] = 1 - data_radar_filtered['Int_Rank']

# Examine data
data_radar_filtered.head()


Unnamed: 0,Player,Tm,GS,Completion %,Yards,Touchdowns,Interceptions,Yards/Attempt
0,Jameis Winston,TAM,16.0,60.7,5109.0,33.0,30.0,8.2
1,Dak Prescott,DAL,16.0,65.1,4902.0,30.0,11.0,8.2
2,Jared Goff,LAR,16.0,62.9,4638.0,22.0,16.0,7.4
3,Philip Rivers,LAC,16.0,66.0,4615.0,23.0,20.0,7.8
4,Matt Ryan,ATL,15.0,66.2,4466.0,26.0,14.0,7.3


In [14]:
data_radar_filtered.tail()

Unnamed: 0,Player,Tm,GS,Completion %,Yards,Touchdowns,Interceptions,Yards/Attempt
27,Ryan Tannehill,TEN,10.0,70.3,2742.0,22.0,6.0,9.6
28,Matthew Stafford,DET,8.0,64.3,2499.0,19.0,5.0,8.6
30,Joe Flacco,DEN,8.0,65.3,1822.0,6.0,5.0,7.0
31,Mason Rudolph,PIT,8.0,62.2,1765.0,13.0,9.0,6.2
32,Case Keenum,WAS,8.0,64.8,1707.0,11.0,5.0,6.9
